From 8ddf19e141a8cf41a7e16de60c3849145b6960f4 Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Tue, 28 Apr 2026 07:20:42 +0100 Subject: [PATCH 01/10] DFN3 WASM optimization: bump tract 0.21 -> 0.22.1, route tract-linalg via patched fork Captures the libDF-side changes that land Vonage's DFN3 WASM kernel investigation (RTF 0.1290 -> 0.0516, -60% / 2.5x faster, audio bit-identical). Changes: 1. tract bump 0.21.4 -> 0.22.1 in libDF/Cargo.toml. Required for the newer tract-linalg architecture that the kernel kit targets. 2. ndarray re-import via tract_core. tract 0.22.1 vendors ndarray under tract_core::ndarray; the older bare `use ndarray::prelude::*;` no longer resolves cleanly under the bumped dep. Updated: - libDF/src/bin/enhance_wav.rs - libDF/src/tract.rs - libDF/src/transforms.rs - libDF/src/wasm.rs - libDF/src/wav_utils.rs 3. m.symbol_table.sym('S') -> m.symbols.sym('S') in libDF/src/tract.rs (3 sites: encoder + erb_decoder + df_decoder). API rename in tract 0.22.1. 4. Workspace [patch.crates-io] override pointing tract-linalg at a local fork that adds six WASM SIMD kernels (4x4 existing + 4x1, 8x1, 16x1, 8x4, 8x8 new) plus a per-M dispatcher in Ops::mmv_f32. Source for the kernels: czoli1976/tract@add-wasm-f32-full-kernel-kit. 5. Cargo.lock updated to reflect deps changes. Production builds must set RUSTFLAGS="-C target-feature=+simd128" (Discovery #1 from the investigation: simd128 was never on in production builds, which kept tract-linalg/src/wasm.rs cfg-gated out and forced the scalar generic_f32_4x4 path, costing 16% RTF). See WASM_SIMD_KERNEL_INVESTIGATION.md in the dfn3-wasm-opt-v2.1 worktree for the full investigation log + measurements. 
--- Cargo.lock | 1720 ++++++++++++++++++---------------- Cargo.toml | 9 + libDF/Cargo.toml | 8 +- libDF/src/bin/enhance_wav.rs | 2 +- libDF/src/tract.rs | 8 +- libDF/src/transforms.rs | 2 +- libDF/src/wasm.rs | 2 +- libDF/src/wav_utils.rs | 2 +- 8 files changed, 938 insertions(+), 815 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 886c82549..05c665bb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,7 +9,7 @@ dependencies = [ "crossbeam-channel", "deep_filter", "log", - "ndarray", + "ndarray 0.15.6", "numpy", "pyo3", ] @@ -19,7 +19,7 @@ name = "DeepFilterLib" version = "0.5.7-pre" dependencies = [ "deep_filter", - "ndarray", + "ndarray 0.15.6", "numpy", "pyo3", ] @@ -62,6 +62,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "getrandom", "once_cell", "version_check", "zerocopy", @@ -111,20 +112,23 @@ dependencies = [ [[package]] name = "android-activity" -version = "0.4.3" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64529721f27c2314ced0890ce45e469574a73e5e6fdd6e9da1860eb29285f5e0" +checksum = "ee91c0c2905bae44f84bfa4e044536541df26b7703fd0888deeb9060fcc44289" dependencies = [ "android-properties", - "bitflags 1.3.2", + "bitflags 2.5.0", "cc", + "cesu8", + "jni", "jni-sys", "libc", "log", - "ndk 0.7.0", + "ndk 0.8.0", "ndk-context", - "ndk-sys 0.4.1+23.1.7779620", - "num_enum 0.6.1", + "ndk-sys 0.5.0+25.2.9519653", + "num_enum", + "thiserror", ] [[package]] @@ -206,6 +210,12 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" +[[package]] +name = "anymap3" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170433209e817da6aae2c51aa0dd443009a613425dd041ebfb2492d1c4c11a25" + [[package]] name = "approx" version = "0.5.1" @@ 
-227,6 +237,12 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +[[package]] +name = "as-raw-xcb-connection" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175571dd1d178ced59193a6fc02dde1b972eb0bc56c892cde9beeceac5bf0f6b" + [[package]] name = "ascii" version = "1.1.0" @@ -374,7 +390,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -409,7 +425,7 @@ checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -448,7 +464,7 @@ dependencies = [ "bitflags 2.5.0", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "lazy_static", "lazycell", "proc-macro2", @@ -456,7 +472,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -515,21 +531,21 @@ dependencies = [ [[package]] name = "block-sys" -version = "0.1.0-beta.1" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa55741ee90902547802152aaf3f8e5248aab7e21468089560d4c8840561146" +checksum = "ae85a0696e7ea3b835a453750bf002770776609115e6d25c6d2ff28a8200f7e7" dependencies = [ "objc-sys", ] [[package]] name = "block2" -version = "0.2.0-alpha.6" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8dd9e63c1744f755c2f60332b88de39d341e5e86239014ad839bd71c106dec42" +checksum = "15b55663a85f33501257357e6421bb33e769d5c9ffb5ba0921c975a123e35e68" dependencies = [ "block-sys", - "objc2-encode", + "objc2 0.4.1", ] [[package]] @@ -575,7 +591,7 @@ checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 
2.0.117", ] [[package]] @@ -590,20 +606,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" -[[package]] -name = "calloop" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52e0d00eb1ea24371a97d2da6201c6747a633dc6dc1988ef503403b4c59504a8" -dependencies = [ - "bitflags 1.3.2", - "log", - "nix 0.25.1", - "slotmap", - "thiserror", - "vec_map 0.8.2", -] - [[package]] name = "calloop" version = "0.12.4" @@ -624,10 +626,10 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f0ea9b9476c7fad82841a8dbb380e2eae480c21910feba80725b46931ed8f02" dependencies = [ - "calloop 0.12.4", + "calloop", "rustix 0.38.34", - "wayland-backend 0.3.3", - "wayland-client 0.31.2", + "wayland-backend", + "wayland-client", ] [[package]] @@ -653,7 +655,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] @@ -710,7 +712,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -727,13 +729,11 @@ checksum = "4bfbf56724aa9eca8afa4fcfadeb479e722935bb2a0900c2d37e0cc477af0688" [[package]] name = "clipboard-win" -version = "4.5.0" +version = "5.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7191c27c2357d9b7ef96baac1773290d4ca63b24205b82a3fd8a0637afcf0362" +checksum = "bde03770d3df201d4fb868f2c9c59e66a3e4e2bd06692a0fe701e7103c7e84d4" dependencies = [ "error-code", - "str-buf", - "winapi", ] [[package]] @@ -763,7 +763,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4274ea815e013e0f9f04a2633423e14194e408a0576c943ce3d14ca56c50031c" dependencies = [ "thiserror", - "x11rb 0.13.1", + "x11rb", ] [[package]] @@ -775,36 
+775,6 @@ dependencies = [ "cc", ] -[[package]] -name = "cocoa" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f425db7937052c684daec3bd6375c8abe2d146dca4b8b143d6db777c39138f3a" -dependencies = [ - "bitflags 1.3.2", - "block", - "cocoa-foundation", - "core-foundation", - "core-graphics", - "foreign-types", - "libc", - "objc", -] - -[[package]] -name = "cocoa-foundation" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c6234cbb2e4c785b456c0644748b1ac416dd045799740356f8363dfe00c93f7" -dependencies = [ - "bitflags 1.3.2", - "block", - "core-foundation", - "core-graphics-types", - "libc", - "objc", -] - [[package]] name = "codespan-reporting" version = "0.11.1" @@ -828,10 +798,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] -name = "com-rs" -version = "0.2.1" +name = "com" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e17887fd17353b65b1b2ef1c526c83e26cd72e74f598a8dc1bee13a48f3d9f6" +dependencies = [ + "com_macros", +] + +[[package]] +name = "com_macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d375883580a668c7481ea6631fc1a8863e33cc335bf56bfad8d7e6d4b04b13a5" +dependencies = [ + "com_macros_support", + "proc-macro2", + "syn 1.0.109", +] + +[[package]] +name = "com_macros_support" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf43edc576402991846b093a7ca18a3477e0ef9c588cde84964b5d3e43016642" +checksum = "ad899a1087a9296d5644792d7cb72b8e34c1bec8e7d4fbc002230169a6e8710c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] [[package]] name = "combine" @@ -900,9 +895,9 @@ checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "core-graphics" -version = 
"0.22.3" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2581bbab3b8ffc6fcbd550bf46c355135d16e9ff2a6ea032ad6b9bf1d7efe4fb" +checksum = "c07782be35f9e1140080c6b96f0d44b739e2278479f64e02fdab4e32dfd8b081" dependencies = [ "bitflags 1.3.2", "core-foundation", @@ -944,16 +939,17 @@ dependencies = [ [[package]] name = "cosmic-text" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0b68966c2543609f8d92f9d33ac3b719b2a67529b0c6c0b3e025637b477eef9" +checksum = "75acbfb314aeb4f5210d379af45ed1ec2c98c7f1790bf57b8a4c562ac0c51b71" dependencies = [ - "aliasable", "fontdb", "libm", "log", "rangemap", + "rustc-hash", "rustybuzz", + "self_cell", "swash", "sys-locale", "unicode-bidi", @@ -1053,6 +1049,15 @@ dependencies = [ "typenum", ] +[[package]] +name = "ctor" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83cf0d42651b16c6dfe68685716d18480d18a9c39c62d76e8cf3eb6ed5d8bcbf" +dependencies = [ + "dtor", +] + [[package]] name = "ctrlc" version = "3.4.4" @@ -1071,12 +1076,12 @@ checksum = "96a6ac251f4a2aca6b3f91340350eab87ae57c3f127ffeb585e92bd336717991" [[package]] name = "d3d12" -version = "0.6.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8f0de2f5a8e7bd4a9eec0e3c781992a4ce1724f68aec7d7a3715344de8b39da" +checksum = "3e3d747f100290a1ca24b752186f61f6637e1deffe3bf6320de6fcb29510a307" dependencies = [ - "bitflags 1.3.2", - "libloading 0.7.4", + "bitflags 2.5.0", + "libloading 0.8.3", "winapi", ] @@ -1095,7 +1100,7 @@ dependencies = [ "event-listener 2.5.3", "ladspa", "log", - "ndarray", + "ndarray 0.15.6", "uuid", "zbus", ] @@ -1120,7 +1125,7 @@ dependencies = [ "js-sys", "lewton", "log", - "ndarray", + "ndarray 0.15.6", "ndarray-rand", "num-complex", "ogg", @@ -1130,7 +1135,7 @@ dependencies = [ "realfft", "roots", "rstest", - "rubato", + "rubato 0.14.1", "rust-ini", 
"rustfft", "serde", @@ -1146,9 +1151,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.3.11" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", ] @@ -1187,11 +1192,11 @@ dependencies = [ "env_logger 0.10.2", "iced", "image", - "itertools 0.11.0", + "itertools 0.12.1", "log", - "ndarray", + "ndarray 0.15.6", "ringbuf", - "rubato", + "rubato 0.15.0", ] [[package]] @@ -1210,6 +1215,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd0c93bb4b0c6d9b77f4435b0ae98c24d17f1c45b2ff844c6151a07256ca923b" +[[package]] +name = "dispatch2" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" +dependencies = [ + "bitflags 2.5.0", + "objc2 0.6.4", +] + [[package]] name = "dlib" version = "0.5.2" @@ -1228,24 +1243,70 @@ dependencies = [ "const-random", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - [[package]] name = "downcast-rs" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" +[[package]] +name = "drm" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80bc8c5c6c2941f70a55c15f8d9f00f9710ebda3ffda98075f996a0e6c92756f" +dependencies = [ + "bitflags 2.5.0", + "bytemuck", + "drm-ffi", + "drm-fourcc", + "libc", + "rustix 0.38.34", +] + +[[package]] +name = "drm-ffi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"51a91c9b32ac4e8105dec255e849e0d66e27d7c34d184364fb93e469db08f690" +dependencies = [ + "drm-sys", + "rustix 1.1.4", +] + +[[package]] +name = "drm-fourcc" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aafbcdb8afc29c1a7ee5fbe53b5d62f4565b35a042a662ca9fecd0b54dae6f4" + +[[package]] +name = "drm-sys" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8e1361066d91f5ffccff060a3c3be9c3ecde15be2959c1937595f7a82a9f8" +dependencies = [ + "libc", + "linux-raw-sys 0.9.4", +] + +[[package]] +name = "dtor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edf234dd1594d6dd434a8fb8cada51ddbbc593e40e4a01556a0b31c62da2775b" + [[package]] name = "dyn-clone" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" +[[package]] +name = "dyn-hash" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15401da73a9ed8c80e3b2d4dc05fe10e7b72d7243b9f614e516a44fa99986e88" + [[package]] name = "either" version = "1.11.0" @@ -1270,7 +1331,7 @@ checksum = "5c785274071b1b420972453b306eeca06acf4633829db4223b58a2a8c5953bc4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -1317,23 +1378,19 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] name = "error-code" -version = "2.3.1" +version = "3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "64f18991e7bf11e7ffee451b5318b5c1a73c52d0d0ada6e5a3017c8c1ced6a21" -dependencies = [ - "libc", - "str-buf", -] +checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59" [[package]] name = "etagere" @@ -1490,6 +1547,12 @@ dependencies = [ "spin", ] +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "font-types" version = "0.5.3" @@ -1499,14 +1562,24 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "fontconfig-parser" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbc773e24e02d4ddd8395fd30dc147524273a83e54e0f312d986ea30de5f5646" +dependencies = [ + "roxmltree", +] + [[package]] name = "fontdb" -version = "0.14.1" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8d8cbea8f21307d7e84bca254772981296f058a1d36b461bf4d83a7499fc9e" +checksum = "020e203f177c0fb250fb19455a252e838d2bbbce1f80f25ecc42402aafa8cd38" dependencies = [ + "fontconfig-parser", "log", - "memmap2 0.6.2", + "memmap2 0.8.0", "slotmap", "tinyvec", "ttf-parser 0.19.2", @@ -1514,18 +1587,30 @@ dependencies = [ [[package]] name = "foreign-types" -version = "0.3.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" dependencies = [ + "foreign-types-macros", "foreign-types-shared", ] +[[package]] +name = "foreign-types-macros" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "foreign-types-shared" -version = "0.1.1" +version = 
"0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" +checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" [[package]] name = "futures" @@ -1612,7 +1697,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -1661,16 +1746,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "gethostname" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1ebd34e35c46e00bb73e81363248d627782724609fe1b6396f553f68fe3862e" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "gethostname" version = "0.4.3" @@ -1710,11 +1785,22 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "gl_generator" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a95dfc23a2b4a9a2f5ab41d194f8bfda3cabec42af4e39f08c339eb2a0c124d" +dependencies = [ + "khronos_api", + "log", + "xml-rs", +] + [[package]] name = "glam" -version = "0.24.2" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5418c17512bdf42730f9032c74e1ae39afc408745ebb2acf72fbc4691c17945" +checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3" [[package]] name = "glob" @@ -1724,9 +1810,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "glow" -version = "0.12.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca0fe580e4b60a8ab24a868bc08e2f03cbcb20d3d676601fa909386713333728" +checksum = "bd348e04c43b32574f2de31c8bb397d96c9fcfa1371bd4ca6d8bdc464ab121b1" dependencies = [ "js-sys", "slotmap", @@ 
-1734,11 +1820,20 @@ dependencies = [ "web-sys", ] +[[package]] +name = "glutin_wgl_sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8098adac955faa2d31079b65dc48841251f69efd3ac25477903fc424362ead" +dependencies = [ + "gl_generator", +] + [[package]] name = "glyphon" -version = "0.3.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e87caa7459145f5e5f167bf34db4532901404c679e62339fb712a0e3ccf722a" +checksum = "6a62d0338e4056db6a73221c2fb2e30619452f6ea9651bac4110f51b0f7a7581" dependencies = [ "cosmic-text", "etagere", @@ -1748,34 +1843,34 @@ dependencies = [ [[package]] name = "gpu-alloc" -version = "0.5.4" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22beaafc29b38204457ea030f6fb7a84c9e4dd1b86e311ba0542533453d87f62" +checksum = "fbcd2dba93594b227a1f57ee09b8b9da8892c34d55aa332e034a228d0fe6a171" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", "gpu-alloc-types", ] [[package]] name = "gpu-alloc-types" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54804d0d6bc9d7f26db4eaec1ad10def69b599315f487d32c334a80d1efe67a5" +checksum = "98ff03b468aa837d70984d55f5d3f846f6ec31fe34bbb97c4f85219caeee1ca4" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", ] [[package]] name = "gpu-allocator" -version = "0.22.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce95f9e2e11c2c6fadfce42b5af60005db06576f231f5c92550fdded43c423e8" +checksum = "6f56f6318968d03c18e1bcf4857ff88c61157e9da8e47c5f29055d60e1228884" dependencies = [ - "backtrace", "log", + "presser", "thiserror", "winapi", - "windows 0.44.0", + "windows 0.52.0", ] [[package]] @@ -1821,30 +1916,35 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.12.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" -version = "0.14.5" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "ahash", "allocator-api2", + "equivalent", + "foldhash", ] [[package]] name = "hassle-rs" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1397650ee315e8891a0df210707f0fc61771b0cc518c3023896064c5407cb3b0" +checksum = "af2a7e73e1f34c48da31fb668a907f250794837e08faa144fd24f0b8b741e890" dependencies = [ - "bitflags 1.3.2", - "com-rs", + "bitflags 2.5.0", + "com", "libc", - "libloading 0.7.4", + "libloading 0.8.3", "thiserror", "widestring", "winapi", @@ -1862,8 +1962,8 @@ dependencies = [ "hdf5-types", "lazy_static", "libc", - "ndarray", - "parking_lot 0.12.2", + "ndarray 0.15.6", + "parking_lot 0.12.5", "paste", ] @@ -1875,7 +1975,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -1956,9 +2056,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "iced" -version = "0.10.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c708807ec86f99dd729dc4d42db5239acf118cec14d3c5f57679dcfdbbc472b1" +checksum = "7d4eb0fbbefb8c428b70680e77ed9013887b17c1d6be366b40f264f956d1a096" dependencies = [ "iced_core", "iced_futures", @@ -1971,23 +2071,27 @@ dependencies = [ [[package]] name = "iced_core" -version = "0.10.0" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"64d0bc4fbf018576d08d93f838e6058cc6f10bbc05e04ae249a2a44dffb4ebc8" +checksum = "7d7e6bbd197f311ed3d8b71651876b0ce01318fde52cda862a9a7a4373c9b930" dependencies = [ - "bitflags 1.3.2", - "instant", + "bitflags 2.5.0", + "glam", "log", + "num-traits", "palette", + "raw-window-handle", + "smol_str", "thiserror", - "twox-hash", + "web-time", + "xxhash-rust", ] [[package]] name = "iced_futures" -version = "0.7.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14dab0054a9c7a1cbce227a8cd9ee4a094497b3d06094551ac6c1488d563802e" +checksum = "370bad88fb3832cbeeb3fa6c486b4701fb7e8da32a753b3101d4ce81fc1d9497" dependencies = [ "futures", "iced_core", @@ -1999,52 +2103,57 @@ dependencies = [ [[package]] name = "iced_graphics" -version = "0.9.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ff14447a221e9e9205a13d84d7bbdf0636a3b1daa02cfca690ed09689c4d2b" +checksum = "6a044c193ef0840eacabfa05424717331d1fc5b3ecb9a89316200c75da2ba9a4" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", "bytemuck", - "glam", + "cosmic-text", "half", "iced_core", + "iced_futures", "image", "kamadak-exif", "log", + "once_cell", "raw-window-handle", + "rustc-hash", "thiserror", + "unicode-segmentation", + "xxhash-rust", ] [[package]] name = "iced_renderer" -version = "0.1.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1033385b0db0099a0d13178c9ff93c1ce11e7d0177522acf578bf79febdb2af8" +checksum = "5c281e03001d566058f53dec9325bbe61c62da715341206d2627f57a3ecc7f69" dependencies = [ "iced_graphics", "iced_tiny_skia", "iced_wgpu", "log", - "raw-window-handle", "thiserror", ] [[package]] name = "iced_runtime" -version = "0.1.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c6c89853e1250c6fac82c5015fa2144517be9b33d4b8e456f10e198b23e28bd" +checksum = 
"a79f852c01cc6d61663c94379cb3974ac3ad315a28c504e847d573e094f46822" dependencies = [ "iced_core", "iced_futures", + "raw-window-handle", "thiserror", ] [[package]] name = "iced_style" -version = "0.9.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d85c47d9d13e2281f75ddf98c865daf2101632bd2b855c401dd0b1c8b81a31a0" +checksum = "2ea42a740915d2a5a9ff9c3aa0bca28b16e9fb660bc8f675eed71d186cadb579" dependencies = [ "iced_core", "once_cell", @@ -2053,29 +2162,28 @@ dependencies = [ [[package]] name = "iced_tiny_skia" -version = "0.1.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7715f6222c9470bbbd75a39f70478fa0d1bdfb81a377a34fd1b090ffccc480b" +checksum = "8c2228781f4d381a1cbbd7905a9f077351aa8d37269094021d5d9e779f130aff" dependencies = [ "bytemuck", "cosmic-text", "iced_graphics", "kurbo", "log", - "raw-window-handle", "rustc-hash", "softbuffer", - "tiny-skia 0.10.0", - "twox-hash", + "tiny-skia", + "xxhash-rust", ] [[package]] name = "iced_wgpu" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "703f7c5de46b997ed7b18e05ec67059dcdf3beeac51e917c21071b021bb848b9" +checksum = "e3c243b6700452886aac1ee1987e84d9fb43b56b53fea9a1eb67713fd0fde244" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", "bytemuck", "futures", "glam", @@ -2084,17 +2192,14 @@ dependencies = [ "iced_graphics", "log", "once_cell", - "raw-window-handle", - "rustc-hash", - "twox-hash", "wgpu", ] [[package]] name = "iced_widget" -version = "0.1.3" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a177219ae51c3ba08f228ab932354b360cc669e94aec50c01e7c9b675f074c7c" +checksum = "7e01b2212adecf1cb80e2267f302c0e0c263e55f97812056949199ccf9f0b908" dependencies = [ "iced_renderer", "iced_runtime", @@ -2107,22 +2212,33 @@ dependencies = [ [[package]] name = "iced_winit" -version = "0.10.1" +version = 
"0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0c884bcb14722a57192b40a5ef6b5e170fa2f01fe2ff28d6cdd9efe37acf70" +checksum = "63f66831d0e399b93f631739121a6171780d344b275d56808b9504d8ca75c7d2" dependencies = [ "iced_graphics", "iced_runtime", "iced_style", "log", - "raw-window-handle", "thiserror", + "tracing", "web-sys", "winapi", "window_clipboard", "winit", ] +[[package]] +name = "icrate" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d3aaff8a54577104bafdf686ff18565c3b6903ca5782a2026ef06e2c7aa319" +dependencies = [ + "block2", + "dispatch", + "objc2 0.4.1", +] + [[package]] name = "image" version = "0.24.9" @@ -2141,16 +2257,6 @@ dependencies = [ "tiff", ] -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown 0.12.3", -] - [[package]] name = "indexmap" version = "2.2.6" @@ -2174,9 +2280,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", ] [[package]] @@ -2218,18 +2321,18 @@ dependencies = [ [[package]] name = "itertools" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] [[package]] name = "itertools" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] 
@@ -2320,15 +2423,21 @@ dependencies = [ [[package]] name = "khronos-egl" -version = "4.1.0" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2352bd1d0bceb871cb9d40f24360c8133c11d7486b68b5381c1dd1a32015e3" +checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76" dependencies = [ "libc", - "libloading 0.7.4", + "libloading 0.8.3", "pkg-config", ] +[[package]] +name = "khronos_api" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2db585e1d738fc771bf08a151420d3ed193d9d895a36df7f6f8a9456b911ddc" + [[package]] name = "kstring" version = "2.0.0" @@ -2341,11 +2450,12 @@ dependencies = [ [[package]] name = "kurbo" -version = "0.9.5" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd85a5776cd9500c2e2059c8c76c3b01528566b7fcbaf8098b55a33fc298849b" +checksum = "1618d4ebd923e97d67e7cd363d80aef35fe961005cbbbb3d2dad8bdd1bc63440" dependencies = [ "arrayvec", + "smallvec", ] [[package]] @@ -2356,14 +2466,14 @@ checksum = "6197e2fb8a3da99eca216e9689b47465b23cfe09e1a1ddc720fa1acdd54aa267" dependencies = [ "bitflags 0.8.2", "libc", - "vec_map 0.7.0", + "vec_map", ] [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lazycell" @@ -2390,9 +2500,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.154" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libloading" @@ -2411,14 +2521,14 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.5", + "windows-targets 0.48.5", ] [[package]] name = "libm" -version = "0.2.8" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libredox" @@ -2443,13 +2553,24 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "liquid" -version = "0.26.4" +version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f68ae1011499ae2ef879f631891f21c78e309755f4a5e483c4a8f12e10b609" +checksum = "2a494c3f9dad3cb7ed16f1c51812cbe4b29493d6c2e5cd1e2b87477263d9534d" dependencies = [ - "doc-comment", "liquid-core", "liquid-derive", "liquid-lib", @@ -2458,15 +2579,14 @@ dependencies = [ [[package]] name = "liquid-core" -version = "0.26.4" +version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e0724dfcaad5cfb7965ea0f178ca0870b8d7315178f4a7179f5696f7f04d5f" +checksum = "fc623edee8a618b4543e8e8505584f4847a4e51b805db1af6d9af0a3395d0d57" dependencies = [ "anymap2", - "itertools 0.10.5", + "itertools 0.14.0", "kstring", "liquid-derive", - "num-traits", "pest", "pest_derive", "regex", @@ -2476,24 
+2596,23 @@ dependencies = [ [[package]] name = "liquid-derive" -version = "0.26.4" +version = "0.26.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc2fb41a9bb4257a3803154bdf7e2df7d45197d1941c9b1a90ad815231630721" +checksum = "de66c928222984aea59fcaed8ba627f388aaac3c1f57dcb05cc25495ef8faefe" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] name = "liquid-lib" -version = "0.26.4" +version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2a17e273a6fb1fb6268f7a5867ddfd0bd4683c7e19b51084f3d567fad4348c0" +checksum = "9befeedd61f5995bc128c571db65300aeb50d62e4f0542c88282dbcb5f72372a" dependencies = [ - "itertools 0.10.5", + "itertools 0.14.0", "liquid-core", - "once_cell", "percent-encoding", "regex", "time", @@ -2502,11 +2621,10 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.12" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "autocfg", "scopeguard", ] @@ -2518,11 +2636,11 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "lru" -version = "0.11.1" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.15.5", ] [[package]] @@ -2567,18 +2685,9 @@ checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "memmap2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] 
- -[[package]] -name = "memmap2" -version = "0.6.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872" +checksum = "43a5a03cefb0d953ec0be133036f14e109412fa594edc2f77227249db66cc3ed" dependencies = [ "libc", ] @@ -2592,15 +2701,6 @@ dependencies = [ "libc", ] -[[package]] -name = "memoffset" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" -dependencies = [ - "autocfg", -] - [[package]] name = "memoffset" version = "0.7.1" @@ -2621,16 +2721,17 @@ dependencies = [ [[package]] name = "metal" -version = "0.24.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de11355d1f6781482d027a3b4d4de7825dcedb197bf573e0596d00008402d060" +checksum = "c43f73953f8cbe511f021b58f18c3ce1c3d1ae13fe953293e13345bf83217f25" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", "block", "core-graphics-types", "foreign-types", "log", "objc", + "paste", ] [[package]] @@ -2649,18 +2750,6 @@ dependencies = [ "simd-adler32", ] -[[package]] -name = "mio" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" -dependencies = [ - "libc", - "log", - "wasi", - "windows-sys 0.48.0", -] - [[package]] name = "mutate_once" version = "0.1.1" @@ -2669,15 +2758,15 @@ checksum = "16cf681a23b4d0a43fc35024c176437f9dcd818db34e0f42ab456a0ee5ad497b" [[package]] name = "naga" -version = "0.12.3" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbcc2e0513220fd2b598e6068608d4462db20322c0e77e47f6f488dfcfc279cb" +checksum = "50e3524642f53d9af419ab5e8dd29d3ba155708267667c2f3f06c88c9e130843" dependencies = [ "bit-set", - "bitflags 1.3.2", + "bitflags 2.5.0", "codespan-reporting", 
"hexf-parse", - "indexmap 1.9.3", + "indexmap", "log", "num-traits", "rustc-hash", @@ -2701,42 +2790,59 @@ dependencies = [ "serde", ] +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "ndarray-rand" version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "65608f937acc725f5b164dcf40f4f0bc5d67dc268ab8a649d3002606718c4588" dependencies = [ - "ndarray", + "ndarray 0.15.6", "rand", "rand_distr", ] [[package]] name = "ndk" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "451422b7e4718271c8b5b3aadf5adedba43dc76312454b387e98fae0fc951aa0" +checksum = "2076a31b7010b17a38c01907c45b945e8f11495ee4dd588309718901b1f7a5b7" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", "jni-sys", - "ndk-sys 0.4.1+23.1.7779620", - "num_enum 0.5.11", + "log", + "ndk-sys 0.5.0+25.2.9519653", + "num_enum", "raw-window-handle", "thiserror", ] [[package]] name = "ndk" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2076a31b7010b17a38c01907c45b945e8f11495ee4dd588309718901b1f7a5b7" +checksum = "c3f42e7bbe13d351b6bead8286a43aac9534b82bd3cc43e47037f012ebfd62d4" dependencies = [ "bitflags 2.5.0", "jni-sys", "log", - "ndk-sys 0.5.0+25.2.9519653", - "num_enum 0.7.2", + "ndk-sys 0.6.0+11769913", + "num_enum", + "raw-window-handle", "thiserror", ] @@ -2746,15 +2852,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27b02d87554356db9e9a873add8782d4ea6e3e58ea071a9adb9a2e8ddb884a8b" -[[package]] -name = "ndk-sys" -version = "0.4.1+23.1.7779620" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cf2aae958bd232cac5069850591667ad422d263686d75b52a065f9badeee5a3" -dependencies = [ - "jni-sys", -] - [[package]] name = "ndk-sys" version = "0.5.0+25.2.9519653" @@ -2765,28 +2862,12 @@ dependencies = [ ] [[package]] -name = "nix" -version = "0.24.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069" -dependencies = [ - "bitflags 1.3.2", - "cfg-if", - "libc", - "memoffset 0.6.5", -] - -[[package]] -name = "nix" -version = "0.25.1" +name = "ndk-sys" +version = "0.6.0+11769913" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" +checksum = "ee6cda3051665f1fb8d9e08fc35c96d5a244fb1be711a03b71118828afc9a873" dependencies = [ - "autocfg", - "bitflags 1.3.2", - "cfg-if", - "libc", - "memoffset 0.6.5", + "jni-sys", ] [[package]] @@ -2799,7 +2880,6 @@ dependencies = [ "cfg-if", "libc", "memoffset 0.7.1", - "pin-utils", ] [[package]] @@ -2824,6 +2904,24 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom-language" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2de2bc5b451bfedaef92c90b8939a8fff5770bdcc1fafd6239d086aab8fa6b29" +dependencies = [ + "nom 8.0.0", +] + [[package]] name = "num-complex" version = "0.4.5" @@ -2836,9 +2934,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" 
[[package]] name = "num-derive" @@ -2848,7 +2946,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -2880,55 +2978,13 @@ dependencies = [ "libc", ] -[[package]] -name = "num_enum" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9" -dependencies = [ - "num_enum_derive 0.5.11", -] - -[[package]] -name = "num_enum" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a015b430d3c108a207fd776d2e2196aaf8b1cf8cf93253e3a097ff3085076a1" -dependencies = [ - "num_enum_derive 0.6.1", -] - [[package]] name = "num_enum" version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" dependencies = [ - "num_enum_derive 0.7.2", -] - -[[package]] -name = "num_enum_derive" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799" -dependencies = [ - "proc-macro-crate 1.3.1", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "num_enum_derive" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96667db765a921f7b295ffee8b60472b686a51d4f21c2ee4ffdb94c7013b65a6" -dependencies = [ - "proc-macro-crate 1.3.1", - "proc-macro2", - "quote", - "syn 2.0.60", + "num_enum_derive", ] [[package]] @@ -2940,7 +2996,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -2950,7 +3006,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef41cbb417ea83b30525259e30ccef6af39b31c240bda578889494c5392d331" dependencies = [ "libc", - "ndarray", + "ndarray 0.15.6", 
"num-complex", "num-integer", "num-traits", @@ -2981,28 +3037,97 @@ dependencies = [ [[package]] name = "objc-sys" -version = "0.2.0-beta.2" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b9834c1e95694a05a828b59f55fa2afec6288359cda67146126b3f90a55d7" +checksum = "cdb91bdd390c7ce1a8607f35f3ca7151b65afc0ff5ff3b34fa350f7d7c7e4310" [[package]] name = "objc2" -version = "0.3.0-beta.3.patch-leaks.3" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e01640f9f2cb1220bbe80325e179e532cb3379ebcd1bf2279d703c19fe3a468" +checksum = "559c5a40fdd30eb5e344fbceacf7595a81e242529fb4e21cf5f43fb4f11ff98d" dependencies = [ - "block2", "objc-sys", - "objc2-encode", + "objc2-encode 3.0.0", +] + +[[package]] +name = "objc2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" +dependencies = [ + "objc2-encode 4.1.0", +] + +[[package]] +name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags 2.5.0", + "dispatch2", + "objc2 0.6.4", +] + +[[package]] +name = "objc2-core-graphics" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807" +dependencies = [ + "bitflags 2.5.0", + "dispatch2", + "objc2 0.6.4", + "objc2-core-foundation", + "objc2-io-surface", ] [[package]] name = "objc2-encode" -version = "2.0.0-pre.2" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abfcac41015b00a120608fdaa6938c44cb983fee294351cc4bac7638b4e50512" +checksum = "d079845b37af429bfe5dfa76e6d087d788031045b25cfc6fd898486fd9847666" + +[[package]] +name = "objc2-encode" +version = "4.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" + +[[package]] +name = "objc2-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" dependencies = [ - "objc-sys", + "bitflags 2.5.0", + "objc2 0.6.4", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-io-surface" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d" +dependencies = [ + "bitflags 2.5.0", + "objc2 0.6.4", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-quartz-core" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f" +dependencies = [ + "bitflags 2.5.0", + "objc2 0.6.4", + "objc2-core-foundation", + "objc2-foundation", ] [[package]] @@ -3101,9 +3226,9 @@ dependencies = [ [[package]] name = "ouroboros" -version = "0.17.2" +version = "0.18.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2ba07320d39dfea882faa70554b4bd342a5f273ed59ba7c1c6b4c840492c954" +checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59" dependencies = [ "aliasable", "ouroboros_macro", @@ -3112,15 +3237,15 @@ dependencies = [ [[package]] name = "ouroboros_macro" -version = "0.17.2" +version = "0.18.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec4c6225c69b4ca778c0aea097321a64c421cf4577b331c61b229267edabb6f8" +checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0" dependencies = [ "heck 0.4.1", - "proc-macro-error", "proc-macro2", + "proc-macro2-diagnostics", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -3153,7 +3278,7 @@ dependencies = [ "by_address", "proc-macro2", 
"quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -3175,12 +3300,12 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.2" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", - "parking_lot_core 0.9.10", + "parking_lot_core 0.9.12", ] [[package]] @@ -3199,15 +3324,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.10" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", "redox_syscall 0.5.1", "smallvec", - "windows-targets 0.52.5", + "windows-link", ] [[package]] @@ -3216,6 +3341,12 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + [[package]] name = "percent-encoding" version = "2.3.1" @@ -3253,7 +3384,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -3297,7 +3428,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -3388,6 +3519,15 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +[[package]] +name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -3400,6 +3540,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "presser" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8cf8e6a8aa66ce33f63993ffc4ea4271eb5b0530a9002db8455ea6050c77bfa" + [[package]] name = "primal-check" version = "0.3.3" @@ -3437,7 +3583,6 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn 1.0.109", "version_check", ] @@ -3454,13 +3599,26 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.81" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "version_check", + "yansi", +] + [[package]] name = "profiling" version = "1.0.15" @@ -3500,7 +3658,7 @@ dependencies = [ "indoc", "libc", "memoffset 0.9.1", - "parking_lot 0.12.2", + "parking_lot 0.11.2", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -3537,7 +3695,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -3550,7 +3708,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -3562,15 +3720,6 @@ dependencies = [ "bytemuck", ] -[[package]] -name = "quick-xml" -version = "0.28.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1" -dependencies = [ - "memchr", -] - [[package]] name = "quick-xml" version = "0.31.0" @@ -3652,9 +3801,9 @@ checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" [[package]] name = "raw-window-handle" -version = "0.5.2" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2ff9a1f06a88b01621b7ae906ef0211290d1c8a168a15542486a8f61c0833b9" +checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539" [[package]] name = "rawpointer" @@ -3793,6 +3942,12 @@ version = "0.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c36d2bbc763f480668d6d6790ae2fdd2e52ac0c21a3a26d156f3534a3d9eea9" +[[package]] +name = "roxmltree" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c20b6793b5c2fa6553b250154b78d6d0db37e72700ae35fad9387a46f487c97" + [[package]] name = "rstest" version = "0.19.0" @@ -3818,7 +3973,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.60", + "syn 2.0.117", "unicode-ident", ] @@ -3834,6 +3989,18 @@ dependencies = [ "realfft", ] +[[package]] +name = "rubato" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5d18b486e7d29a408ef3f825bc1327d8f87af091c987ca2f5b734625940e234" +dependencies = [ + "num-complex", + "num-integer", + "num-traits", + "realfft", +] + [[package]] name = "rust-ini" version = "0.21.0" @@ -3908,20 +4075,39 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags 2.5.0", + "errno", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.2", +] + +[[package]] 
+name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "rustybuzz" -version = "0.8.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82eea22c8f56965eeaf3a209b3d24508256c7b920fb3b6211b8ba0f7c0583250" +checksum = "2ee8fe2a8461a0854a37101fe7a1b13998d0cfa987e43248e81d2a5f4570f6fa" dependencies = [ "bitflags 1.3.2", "bytemuck", "libm", "smallvec", - "ttf-parser 0.19.2", + "ttf-parser 0.20.0", "unicode-bidi-mirroring", "unicode-ccc", - "unicode-general-category", + "unicode-properties", "unicode-script", ] @@ -3931,6 +4117,16 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = "safetensors" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "172dd94c5a87b5c79f945c863da53b2ebc7ccef4eca24ac63cca66a41aab2178" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "same-file" version = "1.0.6" @@ -3963,17 +4159,23 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "sctk-adwaita" -version = "0.5.4" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda4e97be1fd174ccc2aae81c8b694e803fa99b34e8fd0f057a9d70698e3ed09" +checksum = "70b31447ca297092c5a9916fc3b955203157b37c19ca8edde4f52e9843e602c7" dependencies = [ "ab_glyph", "log", - "memmap2 0.5.10", - "smithay-client-toolkit 0.16.1", - "tiny-skia 0.8.4", + "memmap2 0.9.4", + "smithay-client-toolkit", + "tiny-skia", ] +[[package]] +name = "self_cell" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89" + [[package]] name = "semver" 
version = "1.0.22" @@ -3982,22 +4184,32 @@ checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" [[package]] name = "serde" -version = "1.0.200" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.200" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "856f046b9400cee3c8c94ed572ecdb752444c24528c035cd35882aad6f492bcb" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -4019,7 +4231,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -4095,25 +4307,6 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" -[[package]] -name = "smithay-client-toolkit" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870427e30b8f2cbe64bf43ec4b86e88fe39b0a84b3f15efd9c9c2d020bc86eb9" -dependencies = [ - "bitflags 1.3.2", - "calloop 0.10.6", - "dlib", - "lazy_static", - "log", - "memmap2 0.5.10", - "nix 0.24.3", - "pkg-config", - "wayland-client 0.29.5", - "wayland-cursor 0.29.5", - "wayland-protocols 0.29.5", -] - [[package]] name = "smithay-client-toolkit" version = "0.18.1" @@ -4121,7 +4314,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "922fd3eeab3bd820d76537ce8f582b1cf951eceb5475c28500c7457d9d17f53a" dependencies = [ "bitflags 2.5.0", - "calloop 0.12.4", + "calloop", "calloop-wayland-source", "cursor-icon", "libc", @@ -4129,13 +4322,13 @@ dependencies = [ "memmap2 0.9.4", "rustix 0.38.34", "thiserror", - "wayland-backend 0.3.3", - "wayland-client 0.31.2", + "wayland-backend", + "wayland-client", "wayland-csd-frame", - "wayland-cursor 0.31.1", - "wayland-protocols 0.31.2", + "wayland-cursor", + "wayland-protocols", "wayland-protocols-wlr", - "wayland-scanner 0.31.1", + "wayland-scanner", "xkeysym", ] @@ -4146,8 +4339,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c091e7354ea8059d6ad99eace06dd13ddeedbb0ac72d40a9a6e7ff790525882d" dependencies = [ "libc", - "smithay-client-toolkit 0.18.1", - "wayland-backend 0.3.3", + "smithay-client-toolkit", + "wayland-backend", +] + +[[package]] +name = "smol_str" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd538fb6910ac1099850255cf94a94df6551fbdd602454387d0adb2d1ca6dead" +dependencies = [ + "serde", ] [[package]] @@ -4162,30 +4364,34 @@ dependencies = [ [[package]] name = "softbuffer" -version = "0.2.1" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2b953f6ba7285f0af131eb748aabd8ddaf53e0b81dda3ba5d803b0847d6559f" +checksum = "aac18da81ebbf05109ab275b157c22a653bb3c12cf884450179942f81bcbf6c3" dependencies = [ + "as-raw-xcb-connection", "bytemuck", - "cfg_aliases", - "cocoa", - "core-graphics", - "fastrand 1.9.0", - "foreign-types", - "log", - "nix 0.26.4", - "objc", + "drm", + "fastrand 2.1.0", + "js-sys", + "memmap2 0.9.4", + "ndk 0.9.0", + "objc2 0.6.4", + "objc2-core-foundation", + "objc2-core-graphics", + "objc2-foundation", + "objc2-quartz-core", "raw-window-handle", - "redox_syscall 0.3.5", - "thiserror", + "redox_syscall 0.5.1", + "rustix 
1.1.4", + "tiny-xlib", + "tracing", "wasm-bindgen", - "wayland-backend 0.1.2", - "wayland-client 0.30.2", - "wayland-sys 0.30.1", + "wayland-backend", + "wayland-client", + "wayland-sys", "web-sys", - "windows-sys 0.48.0", - "x11-dl", - "x11rb 0.11.1", + "windows-sys 0.61.2", + "x11rb", ] [[package]] @@ -4199,12 +4405,11 @@ dependencies = [ [[package]] name = "spirv" -version = "0.2.0+1.5.4" +version = "0.3.0+sdk-1.3.268.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "246bfa38fe3db3f1dfc8ca5a2cdeb7348c78be2112740cc0ec8ef18b6d94f830" +checksum = "eda41003dc44290527a59b13432d4a0379379fa074b70174882adfbdfd917844" dependencies = [ - "bitflags 1.3.2", - "num-traits", + "bitflags 2.5.0", ] [[package]] @@ -4213,12 +4418,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "str-buf" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" - [[package]] name = "strength_reduce" version = "0.2.4" @@ -4278,9 +4477,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.60" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -4351,7 +4550,7 @@ checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] @@ -4367,30 +4566,30 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = 
"743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", "num-conv", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.2" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -4399,31 +4598,17 @@ dependencies = [ [[package]] name = "tiny-keccak" version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - -[[package]] -name = "tiny-skia" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df8493a203431061e901613751931f047d1971337153f96d0e5e363d6dbf6a67" -dependencies = [ - "arrayref", - "arrayvec", - "bytemuck", - "cfg-if", - "png", - "tiny-skia-path 0.8.4", +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", ] [[package]] name = "tiny-skia" -version = "0.10.0" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7db11798945fa5c3e5490c794ccca7c6de86d3afdd54b4eb324109939c6f37bc" +checksum = "83d13394d44dae3207b52a326c0c85a8bf87f1541f23b0d143811088497b09ab" dependencies = [ "arrayref", "arrayvec", @@ -4431,14 +4616,14 @@ dependencies = [ "cfg-if", "log", "png", - "tiny-skia-path 0.10.0", + 
"tiny-skia-path", ] [[package]] name = "tiny-skia-path" -version = "0.8.4" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adbfb5d3f3dd57a0e11d12f4f13d4ebbbc1b5c15b7ab0a156d030b21da5f677c" +checksum = "9c9e7fc0c2e86a30b117d0462aa261b72b7a99b7ebd7deb3a14ceda95c5bdc93" dependencies = [ "arrayref", "bytemuck", @@ -4446,14 +4631,16 @@ dependencies = [ ] [[package]] -name = "tiny-skia-path" -version = "0.10.0" +name = "tiny-xlib" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f60aa35c89ac2687ace1a2556eaaea68e8c0d47408a2e3e7f5c98a489e7281c" +checksum = "a90a0ca3ee6a69f2ad28fd11621a4c3f03b371f366be500b64df260c4ffbafb4" dependencies = [ - "arrayref", - "bytemuck", - "strict-num", + "as-raw-xcb-connection", + "ctor", + "libloading 0.8.3", + "pkg-config", + "tracing", ] [[package]] @@ -4494,7 +4681,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap 2.2.6", + "indexmap", "toml_datetime", "winnow", ] @@ -4505,16 +4692,16 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" dependencies = [ - "indexmap 2.2.6", + "indexmap", "toml_datetime", "winnow", ] [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -4523,31 +4710,32 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", ] [[package]] name = "tract-core" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98ef3e1f2d94e88d007811e78a3dcc9e5734bfd841849416fab09a407488cb7d" +checksum = "b65d67f5190132365dda73fe215bfc5e01b031e8cbfbea9d486bb5b0dbba3545" dependencies = [ "anyhow", + "anymap3", "bit-set", "derive-new", "downcast-rs", @@ -4555,11 +4743,11 @@ dependencies = [ "lazy_static", "log", "maplit", - "ndarray", + "ndarray 0.16.1", "num-complex", "num-integer", "num-traits", - "paste", + "pastey", "rustfft", "smallvec", "tract-data", @@ -4568,19 +4756,25 @@ dependencies = [ [[package]] name = "tract-data" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "580103fb6de703d9bff3d64cfe76c26d454cbcb4b84716d3f2936e36c8a88d67" +checksum = "73cd7fda1e5e8b854ea3abdd09126a87fc4af81e6d1e29ec1710a8a4abf4f13a" dependencies = [ "anyhow", + "downcast-rs", + "dyn-clone", + "dyn-hash", "half", "itertools 0.12.1", "lazy_static", + "libm", "maplit", - "ndarray", - "nom", + "ndarray 0.16.1", + "nom 8.0.0", + "nom-language", "num-integer", "num-traits", + "parking_lot 0.12.5", "scan_fmt", "smallvec", "string-interner", @@ -4588,9 +4782,9 @@ dependencies = [ [[package]] name = "tract-hir" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"14a1deb233efc188da3617e66160202ef08247f91f6e12a39940dcb74b5a6af3" +checksum = "554df991b647dba8af0547ee5838b6912ed20b424f2adda0ea0b7faf8db1b151" dependencies = [ "derive-new", "log", @@ -4599,21 +4793,22 @@ dependencies = [ [[package]] name = "tract-linalg" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b58f074c94c74ea736a75b7ac6f696add05c62fc4745d1c420cf7d4d42eb7b2b" +version = "0.22.1" dependencies = [ + "byteorder", "cc", "derive-new", "downcast-rs", "dyn-clone", + "dyn-hash", "half", "lazy_static", "liquid", "liquid-core", + "liquid-derive", "log", "num-traits", - "paste", + "pastey", "scan_fmt", "smallvec", "time", @@ -4624,14 +4819,19 @@ dependencies = [ [[package]] name = "tract-nnef" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630653ba2da4e55bddf1e7fad9d4e128dcb87200de621d12dc9a48a81bdaedd8" +checksum = "45b3755dd0948111b407085d11033ba218cb85b85ce8d795cec2b8353db552ea" dependencies = [ "byteorder", "flate2", + "liquid", + "liquid-core", "log", - "nom", + "nom 8.0.0", + "nom-language", + "safetensors", + "serde_json", "tar", "tract-core", "walkdir", @@ -4639,9 +4839,9 @@ dependencies = [ [[package]] name = "tract-onnx" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb5d5898db7dd7d7051d50365aebcb48b075d6f8bb17ce1f9e75e6e452aed2a" +checksum = "ac23ad1d2d5da3256ae1a78757b1072a8a3fac2a4b28d27cfb561c5942ec2701" dependencies = [ "bytes", "derive-new", @@ -4657,9 +4857,9 @@ dependencies = [ [[package]] name = "tract-onnx-opl" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "620e3c0036ad846d62cc44c3b430ba0e1a22afe9d1abf0cc7f0e4caa3e232186" +checksum = "87561bf0b84f74a124afc0f1997682728da6cd821083511e0357432954fd24f6" dependencies = [ "getrandom", "log", @@ -4671,9 +4871,9 @@ dependencies = [ [[package]] 
name = "tract-pulse" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dcd04336e207e760ce1a47f152664f34c14f3a9ead3af7c16ea2d9d520cf8ac" +checksum = "ff926428bf533d0d8ee70e2626fb9f8197d33d3cc9e0cafc3f9acf8e11b4dd93" dependencies = [ "downcast-rs", "lazy_static", @@ -4683,9 +4883,9 @@ dependencies = [ [[package]] name = "tract-pulse-opl" -version = "0.21.4" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acc42aebbb6ae3300e2435c3ff67f1520636f1b9196644d0f92edcd8b94c88bc" +checksum = "5621466758a263fb3baf6494a9aca555dd90c1c0c5216186987a1857bca21f87" dependencies = [ "downcast-rs", "lazy_static", @@ -4720,17 +4920,6 @@ version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17f77d76d837a7830fe1d4f12b7b4ba4192c1888001c7164257e4bc6d21d96b4" -[[package]] -name = "twox-hash" -version = "1.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" -dependencies = [ - "cfg-if", - "rand", - "static_assertions", -] - [[package]] name = "typenum" version = "1.17.0" @@ -4772,12 +4961,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc2520efa644f8268dce4dcd3050eaa7fc044fca03961e9998ac7e2e92b77cf1" -[[package]] -name = "unicode-general-category" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2281c8c1d221438e373249e065ca4989c4c36952c211ff21a0ee91c44a3869e7" - [[package]] name = "unicode-ident" version = "1.0.12" @@ -4799,6 +4982,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" + [[package]] name = "unicode-script" version = "0.5.6" @@ -4851,12 
+5040,6 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8cdc8b93bd0198ed872357fb2e667f7125646b1762f16d60b2c96350d361897" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - [[package]] name = "version_check" version = "0.9.4" @@ -4887,26 +5070,14 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" -dependencies = [ - "bumpalo", - "log", "once_cell", - "proc-macro2", - "quote", - "syn 2.0.60", + "rustversion", + "wasm-bindgen-macro", "wasm-bindgen-shared", ] @@ -4924,9 +5095,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4934,22 +5105,25 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = 
"9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ + "bumpalo", "proc-macro2", "quote", - "syn 2.0.60", - "wasm-bindgen-backend", + "syn 2.0.117", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" +dependencies = [ + "unicode-ident", +] [[package]] name = "wasm-timer" @@ -4966,21 +5140,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wayland-backend" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41b48e27457e8da3b2260ac60d0a94512f5cba36448679f3747c0865b7893ed8" -dependencies = [ - "cc", - "downcast-rs", - "io-lifetimes", - "nix 0.26.4", - "scoped-tls", - "smallvec", - "wayland-sys 0.30.1", -] - [[package]] name = "wayland-backend" version = "0.3.3" @@ -4992,35 +5151,7 @@ dependencies = [ "rustix 0.38.34", "scoped-tls", "smallvec", - "wayland-sys 0.31.1", -] - -[[package]] -name = "wayland-client" -version = "0.29.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f3b068c05a039c9f755f881dc50f01732214f5685e379829759088967c46715" -dependencies = [ - "bitflags 1.3.2", - "downcast-rs", - "libc", - "nix 0.24.3", - "scoped-tls", - "wayland-commons", - "wayland-scanner 0.29.5", - "wayland-sys 0.29.5", -] - -[[package]] -name = "wayland-client" -version = "0.30.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "489c9654770f674fc7e266b3c579f4053d7551df0ceb392f153adb1f9ed06ac8" -dependencies = [ - "bitflags 1.3.2", - "nix 0.26.4", - "wayland-backend 0.1.2", - "wayland-scanner 0.30.1", + "wayland-sys", ] [[package]] @@ -5031,20 +5162,8 @@ checksum = "82fb96ee935c2cea6668ccb470fb7771f6215d1691746c2d896b447a00ad3f1f" dependencies = [ "bitflags 2.5.0", 
"rustix 0.38.34", - "wayland-backend 0.3.3", - "wayland-scanner 0.31.1", -] - -[[package]] -name = "wayland-commons" -version = "0.29.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8691f134d584a33a6606d9d717b95c4fa20065605f798a3f350d78dced02a902" -dependencies = [ - "nix 0.24.3", - "once_cell", - "smallvec", - "wayland-sys 0.29.5", + "wayland-backend", + "wayland-scanner", ] [[package]] @@ -5055,18 +5174,7 @@ checksum = "625c5029dbd43d25e6aa9615e88b829a5cad13b2819c4ae129fdbb7c31ab4c7e" dependencies = [ "bitflags 2.5.0", "cursor-icon", - "wayland-backend 0.3.3", -] - -[[package]] -name = "wayland-cursor" -version = "0.29.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6865c6b66f13d6257bef1cd40cbfe8ef2f150fb8ebbdb1e8e873455931377661" -dependencies = [ - "nix 0.24.3", - "wayland-client 0.29.5", - "xcursor", + "wayland-backend", ] [[package]] @@ -5076,22 +5184,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71ce5fa868dd13d11a0d04c5e2e65726d0897be8de247c0c5a65886e283231ba" dependencies = [ "rustix 0.38.34", - "wayland-client 0.31.2", + "wayland-client", "xcursor", ] -[[package]] -name = "wayland-protocols" -version = "0.29.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b950621f9354b322ee817a23474e479b34be96c2e909c14f7bc0100e9a970bc6" -dependencies = [ - "bitflags 1.3.2", - "wayland-client 0.29.5", - "wayland-commons", - "wayland-scanner 0.29.5", -] - [[package]] name = "wayland-protocols" version = "0.31.2" @@ -5099,44 +5195,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f81f365b8b4a97f422ac0e8737c438024b5951734506b0e1d775c73030561f4" dependencies = [ "bitflags 2.5.0", - "wayland-backend 0.3.3", - "wayland-client 0.31.2", - "wayland-scanner 0.31.1", + "wayland-backend", + "wayland-client", + "wayland-scanner", ] [[package]] -name = "wayland-protocols-wlr" +name = "wayland-protocols-plasma" version 
= "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1f61b76b6c2d8742e10f9ba5c3737f6530b4c243132c2a2ccc8aa96fe25cd6" +checksum = "23803551115ff9ea9bce586860c5c5a971e360825a0309264102a9495a5ff479" dependencies = [ "bitflags 2.5.0", - "wayland-backend 0.3.3", - "wayland-client 0.31.2", - "wayland-protocols 0.31.2", - "wayland-scanner 0.31.1", -] - -[[package]] -name = "wayland-scanner" -version = "0.29.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4303d8fa22ab852f789e75a967f0a2cdc430a607751c0499bada3e451cbd53" -dependencies = [ - "proc-macro2", - "quote", - "xml-rs", + "wayland-backend", + "wayland-client", + "wayland-protocols", + "wayland-scanner", ] [[package]] -name = "wayland-scanner" -version = "0.30.1" +name = "wayland-protocols-wlr" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9b873b257fbc32ec909c0eb80dea312076a67014e65e245f5eb69a6b8ab330e" +checksum = "ad1f61b76b6c2d8742e10f9ba5c3737f6530b4c243132c2a2ccc8aa96fe25cd6" dependencies = [ - "proc-macro2", - "quick-xml 0.28.2", - "quote", + "bitflags 2.5.0", + "wayland-backend", + "wayland-client", + "wayland-protocols", + "wayland-scanner", ] [[package]] @@ -5146,50 +5233,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b3a62929287001986fb58c789dce9b67604a397c15c611ad9f747300b6c283" dependencies = [ "proc-macro2", - "quick-xml 0.31.0", + "quick-xml", "quote", ] [[package]] name = "wayland-sys" -version = "0.29.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be12ce1a3c39ec7dba25594b97b42cb3195d54953ddb9d3d95a7c3902bc6e9d4" -dependencies = [ - "dlib", - "lazy_static", - "pkg-config", -] - -[[package]] -name = "wayland-sys" -version = "0.30.1" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b2a02ac608e07132978689a6f9bf4214949c85998c247abadd4f4129b1aa06" +checksum = 
"15a0c8eaff5216d07f226cb7a549159267f3467b289d9a2e52fd3ef5aae2b7af" dependencies = [ "dlib", - "lazy_static", "log", + "once_cell", "pkg-config", ] [[package]] -name = "wayland-sys" -version = "0.31.1" +name = "web-sys" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15a0c8eaff5216d07f226cb7a549159267f3467b289d9a2e52fd3ef5aae2b7af" +checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" dependencies = [ - "dlib", - "log", - "once_cell", - "pkg-config", + "js-sys", + "wasm-bindgen", ] [[package]] -name = "web-sys" -version = "0.3.69" +name = "web-time" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "aa30049b1c872b72c89866d458eae9f20380ab280ffd1b1e18df2d3e2d98cfe0" dependencies = [ "js-sys", "wasm-bindgen", @@ -5203,16 +5277,17 @@ checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" [[package]] name = "wgpu" -version = "0.16.3" +version = "0.19.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "480c965c9306872eb6255fa55e4b4953be55a8b64d57e61d7ff840d3dcc051cd" +checksum = "cbd7311dbd2abcfebaabf1841a2824ed7c8be443a0f29166e5d3c6a53a762c01" dependencies = [ "arrayvec", "cfg-if", + "cfg_aliases", "js-sys", "log", "naga", - "parking_lot 0.12.2", + "parking_lot 0.11.2", "profiling", "raw-window-handle", "smallvec", @@ -5227,17 +5302,20 @@ dependencies = [ [[package]] name = "wgpu-core" -version = "0.16.1" +version = "0.19.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f478237b4bf0d5b70a39898a66fa67ca3a007d79f2520485b8b0c3dfc46f8c2" +checksum = "28b94525fc99ba9e5c9a9e24764f2bc29bad0911a7446c12f446a8277369bf3a" dependencies = [ "arrayvec", "bit-vec", "bitflags 2.5.0", + "cfg_aliases", "codespan-reporting", + "indexmap", "log", "naga", - "parking_lot 0.12.2", + "once_cell", + "parking_lot 
0.11.2", "profiling", "raw-window-handle", "rustc-hash", @@ -5250,9 +5328,9 @@ dependencies = [ [[package]] name = "wgpu-hal" -version = "0.16.2" +version = "0.19.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecb3258078e936deee14fd4e0febe1cfe9bbb5ffef165cb60218d2ee5eb4448" +checksum = "bfabcfc55fd86611a855816326b2d54c3b2fd7972c27ce414291562650552703" dependencies = [ "android_system_properties", "arrayvec", @@ -5260,10 +5338,11 @@ dependencies = [ "bit-set", "bitflags 2.5.0", "block", + "cfg_aliases", "core-graphics-types", "d3d12", - "foreign-types", "glow", + "glutin_wgl_sys", "gpu-alloc", "gpu-allocator", "gpu-descriptor", @@ -5275,8 +5354,10 @@ dependencies = [ "log", "metal", "naga", + "ndk-sys 0.5.0+25.2.9519653", "objc", - "parking_lot 0.12.2", + "once_cell", + "parking_lot 0.11.2", "profiling", "range-alloc", "raw-window-handle", @@ -5292,9 +5373,9 @@ dependencies = [ [[package]] name = "wgpu-types" -version = "0.16.1" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c153280bb108c2979eb5c7391cb18c56642dd3c072e55f52065e13e2a1252a" +checksum = "b671ff9fb03f78b46ff176494ee1ebe7d603393f42664be55b64dc8d53969805" dependencies = [ "bitflags 2.5.0", "js-sys", @@ -5332,15 +5413,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "winapi-wsapoll" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eafc5f679c576995526e81635d0cf9695841736712b4e892f87abbe6fed3f28" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -5349,9 +5421,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "window_clipboard" -version = "0.3.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63287c9c4396ccf5346d035a9b0fcaead9e18377637f5eaa78b7ac65c873ff7d" +checksum = 
"f6d692d46038c433f9daee7ad8757e002a4248c20b0a3fbc991d99521d3bcb6d" dependencies = [ "clipboard-win", "clipboard_macos", @@ -5363,11 +5435,12 @@ dependencies = [ [[package]] name = "windows" -version = "0.44.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e745dab35a0c4c77aa3ce42d595e13d2003d6902d6b08c9ef5fc326d08da12b" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ - "windows-targets 0.42.2", + "windows-core 0.52.0", + "windows-targets 0.52.5", ] [[package]] @@ -5376,7 +5449,16 @@ version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9252e5725dbed82865af151df558e754e4a3c2c30818359eb17465f1346a1b49" dependencies = [ - "windows-core", + "windows-core 0.54.0", + "windows-targets 0.52.5", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ "windows-targets 0.52.5", ] @@ -5390,6 +5472,12 @@ dependencies = [ "windows-targets 0.52.5", ] +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-result" version = "0.1.1" @@ -5426,6 +5514,15 @@ dependencies = [ "windows-targets 0.52.5", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -5606,37 +5703,50 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winit" -version = "0.28.7" +version = "0.29.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9596d90b45384f5281384ab204224876e8e8bf7d58366d9b795ad99aa9894b94" +checksum = "0d59ad965a635657faf09c8f062badd885748428933dad8e8bdd64064d92e5ca" dependencies = [ + "ahash", "android-activity", - "bitflags 1.3.2", + "atomic-waker", + "bitflags 2.5.0", + "bytemuck", + "calloop", "cfg_aliases", "core-foundation", "core-graphics", - "dispatch", - "instant", + "cursor-icon", + "icrate", + "js-sys", "libc", "log", - "mio", - "ndk 0.7.0", - "objc2", + "memmap2 0.9.4", + "ndk 0.8.0", + "ndk-sys 0.5.0+25.2.9519653", + "objc2 0.4.1", "once_cell", "orbclient", "percent-encoding", "raw-window-handle", "redox_syscall 0.3.5", + "rustix 0.38.34", "sctk-adwaita", - "smithay-client-toolkit 0.16.1", + "smithay-client-toolkit", + "smol_str", + "unicode-segmentation", "wasm-bindgen", - "wayland-client 0.29.5", - "wayland-commons", - "wayland-protocols 0.29.5", - "wayland-scanner 0.29.5", + "wasm-bindgen-futures", + "wayland-backend", + "wayland-client", + "wayland-protocols", + "wayland-protocols-plasma", "web-sys", - "windows-sys 0.45.0", + "web-time", + "windows-sys 0.48.0", "x11-dl", + "x11rb", + "xkbcommon-dl", ] [[package]] @@ -5670,40 +5780,19 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "x11rb" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf3c79412dd91bae7a7366b8ad1565a85e35dd049affc3a6a2c549e97419617" -dependencies = [ - "gethostname 0.2.3", - "libc", - "libloading 0.7.4", - "nix 0.25.1", - "once_cell", - "winapi", - "winapi-wsapoll", - "x11rb-protocol 0.11.1", -] - [[package]] name = "x11rb" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d91ffca73ee7f68ce055750bf9f6eca0780b8c85eff9bc046a3b0da41755e12" dependencies = [ - "gethostname 0.4.3", + "as-raw-xcb-connection", + "gethostname", + "libc", + "libloading 0.8.3", + "once_cell", "rustix 0.38.34", - "x11rb-protocol 0.13.1", -] - -[[package]] 
-name = "x11rb-protocol" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0b1513b141123073ce54d5bb1d33f801f17508fbd61e02060b1214e96d39c56" -dependencies = [ - "nix 0.25.1", + "x11rb-protocol", ] [[package]] @@ -5739,6 +5828,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "xkbcommon-dl" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039de8032a9a8856a6be89cea3e5d12fdd82306ab7c94d74e6deab2460651c5" +dependencies = [ + "bitflags 2.5.0", + "dlib", + "log", + "once_cell", + "xkeysym", +] + [[package]] name = "xkeysym" version = "0.2.0" @@ -5751,6 +5853,18 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "791978798f0597cfc70478424c2b4fdc2b7a8024aaff78497ef00f24ef674193" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "yazi" version = "0.1.6" @@ -5846,7 +5960,7 @@ checksum = "6f4b6c273f496d8fd4eaf18853e6b448760225dc030ff2c485a786859aea6393" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.117", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1019dbedd..8a5306657 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,3 +25,12 @@ debug = false lto = "fat" strip = true panic = "abort" + +# Local patched tract-linalg with wasm_f32_4x1 GEMV kernel. +# Adds a 4-row × 1-col SIMD kernel handling N=1 matrix-vector ops without the +# 75% column-tile waste of the existing wasm_f32_4x4. Source at +# /Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1/src/wasm.rs. 
+[patch.crates-io] +tract-linalg = { path = "/Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1" } + + diff --git a/libDF/Cargo.toml b/libDF/Cargo.toml index c0f3271ef..879c98ee0 100644 --- a/libDF/Cargo.toml +++ b/libDF/Cargo.toml @@ -120,10 +120,10 @@ claxon = { version = "^0.4", optional = true } env_logger = { version = "0.11", optional = true } clap = { version = "4.0", optional = true, features = ["derive"] } rust-ini = { version = "^0.21", optional = true } -tract-core = { version = "^0.21.4", optional = true } -tract-onnx = { version = "^0.21.4", optional = true } -tract-pulse = { version = "^0.21.4", optional = true } -tract-hir = { version = "^0.21.4", optional = true } +tract-core = { version = "=0.22.1", optional = true } +tract-onnx = { version = "=0.22.1", optional = true } +tract-pulse = { version = "=0.22.1", optional = true } +tract-hir = { version = "=0.22.1", optional = true } flate2 = { version = "1.0.24", optional = true } tar = { version = "0.4.38", optional = true } wasm-bindgen = { version = "0.2.87", optional = true } diff --git a/libDF/src/bin/enhance_wav.rs b/libDF/src/bin/enhance_wav.rs index b37592660..24e06a8e4 100644 --- a/libDF/src/bin/enhance_wav.rs +++ b/libDF/src/bin/enhance_wav.rs @@ -3,7 +3,7 @@ use std::{path::PathBuf, process::exit, time::Instant}; use anyhow::Result; use clap::{Parser, ValueHint}; use df::{tract::*, transforms::resample, wav_utils::*}; -use ndarray::{prelude::*, Axis}; +use tract_core::ndarray::{self, prelude::*, Axis}; #[cfg(all( not(windows), diff --git a/libDF/src/tract.rs b/libDF/src/tract.rs index b39a98726..2415538f1 100644 --- a/libDF/src/tract.rs +++ b/libDF/src/tract.rs @@ -7,7 +7,7 @@ use std::time::Instant; use anyhow::{bail, Context, Result}; use flate2::read::GzDecoder; use ini::Ini; -use ndarray::{prelude::*, Axis}; +use tract_core::ndarray::{self, prelude::*, Axis}; use tar::Archive; use tract_core::internal::tract_itertools::izip; use 
tract_core::internal::tract_smallvec::alloc::collections::VecDeque; @@ -772,7 +772,7 @@ fn init_encoder_impl( n_ch: usize, ) -> Result { log::debug!("Start init encoder."); - let s = m.symbol_table.sym("S"); + let s = m.symbols.sym("S"); let nb_erb = df_cfg.get("nb_erb").unwrap().parse::()?; let nb_df = df_cfg.get("nb_df").unwrap().parse::()?; @@ -821,7 +821,7 @@ fn init_erb_decoder_impl( mask_reduction: Option, ) -> Result { log::debug!("Start init ERB decoder."); - let s = m.symbol_table.sym("S"); + let s = m.symbols.sym("S"); let nb_erb = df_cfg.get("nb_erb").unwrap().parse::()?; let layer_width = net_cfg.get("conv_ch").unwrap().parse::()?; @@ -934,7 +934,7 @@ fn init_df_decoder_impl( n_ch: usize, ) -> Result { log::debug!("Start init DF decoder."); - let s = m.symbol_table.sym("S"); + let s = m.symbols.sym("S"); let nb_erb = df_cfg.get("nb_erb").unwrap().parse::()?; let nb_df = df_cfg.get("nb_df").unwrap().parse::()?; diff --git a/libDF/src/transforms.rs b/libDF/src/transforms.rs index 7d6b1175e..9840b2ecd 100644 --- a/libDF/src/transforms.rs +++ b/libDF/src/transforms.rs @@ -1,6 +1,6 @@ use std::mem::MaybeUninit; -use ndarray::{prelude::*, Slice}; +use tract_core::ndarray::{self, prelude::*, Slice}; use rubato::{FftFixedInOut, Resampler}; use thiserror::Error; diff --git a/libDF/src/wasm.rs b/libDF/src/wasm.rs index 2e02095b7..0cfbe64b7 100644 --- a/libDF/src/wasm.rs +++ b/libDF/src/wasm.rs @@ -1,6 +1,6 @@ use std::boxed::Box; -use ndarray::prelude::*; +use tract_core::ndarray::{self, prelude::*}; use wasm_bindgen::prelude::*; use crate::tract::*; diff --git a/libDF/src/wav_utils.rs b/libDF/src/wav_utils.rs index 99e2cbc56..904f52150 100644 --- a/libDF/src/wav_utils.rs +++ b/libDF/src/wav_utils.rs @@ -6,7 +6,7 @@ use std::{ use hound::{WavReader, WavWriter}; #[cfg(any(feature = "dataset", feature = "wav-utils"))] -use ndarray::prelude::*; +use tract_core::ndarray::{self, prelude::*}; use thiserror::Error; #[derive(Error, Debug)] From 
5308f75ea9298a5bc07fab77d158e2180b9831f5 Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Tue, 28 Apr 2026 07:24:38 +0100 Subject: [PATCH 02/10] workspace Cargo.toml: repoint tract-linalg patch at GH fork (git source) + describe full kernel kit Two changes in the [patch.crates-io] override: 1. Source swap: absolute local path -> git source on czoli1976/tract. Makes the build reproducible for anyone with access to the fork; was previously pinned to /Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1 which doesn't exist outside the original investigation environment. 2. Comment updated to describe the full WASM SIMD kernel kit (4x4 existing + 4x1, 8x1, 16x1, 8x4, 8x8 new + per-M dispatcher). The previous comment only mentioned the initial 4x1 kernel and was stale relative to what's actually being patched in. Resulting patch: tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" } Cargo.lock will regenerate on next build (the registry source descriptor for tract-linalg changes from path-based to git-based). --- Cargo.toml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8a5306657..acdcfd4a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,11 +26,14 @@ lto = "fat" strip = true panic = "abort" -# Local patched tract-linalg with wasm_f32_4x1 GEMV kernel. -# Adds a 4-row × 1-col SIMD kernel handling N=1 matrix-vector ops without the -# 75% column-tile waste of the existing wasm_f32_4x4. Source at -# /Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1/src/wasm.rs. +# Patched tract-linalg with the WASM SIMD kernel kit: +# wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV variants) +# + wasm_f32_8x4 / 8x8 (new MM variants) + per-M dispatcher in Ops::mmv_f32. +# Cumulative impact on DFN3: RTF 0.1290 -> 0.0516 (-60%, 2.5x faster), bit-identical audio. 
+# Source: https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit +# Tracking: czoli1976/tract#2 (full kit), czoli1976/tract#1 (upstream-PR-ready 4x1 only), +# sonos/tract#2161 (upstream issue). [patch.crates-io] -tract-linalg = { path = "/Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1" } +tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" } From 8153d23fc1f295e9327f9bcf865c6b7b5eee9387 Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Tue, 28 Apr 2026 09:15:51 +0100 Subject: [PATCH 03/10] workspace Cargo.toml: drop 8x4 from kit comment (kernel was removed upstream-side after A/B) Mirrors czoli1976/tract@9c45f8d which dropped wasm_f32_8x4 after a controlled A/B confirmed it's structurally dead code for DFN3 (every MM op has N >= 8, strategizer always picks 8x8 over 8x4; mean A/B delta was -1.22% within thermal noise). Final kit shipped via [patch.crates-io]: wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV) + wasm_f32_8x8 (new MM) + per-M dispatcher in Ops::mmv_f32. --- Cargo.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index acdcfd4a8..00270c1c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,11 +28,10 @@ panic = "abort" # Patched tract-linalg with the WASM SIMD kernel kit: # wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV variants) -# + wasm_f32_8x4 / 8x8 (new MM variants) + per-M dispatcher in Ops::mmv_f32. +# + wasm_f32_8x8 (new MM variant) + per-M dispatcher in Ops::mmv_f32. # Cumulative impact on DFN3: RTF 0.1290 -> 0.0516 (-60%, 2.5x faster), bit-identical audio. # Source: https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit -# Tracking: czoli1976/tract#2 (full kit), czoli1976/tract#1 (upstream-PR-ready 4x1 only), -# sonos/tract#2161 (upstream issue). +# Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue). 
[patch.crates-io] tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" } From c90060c43069aeaf8fcef12ee79c19f010ec138c Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Tue, 28 Apr 2026 09:28:30 +0100 Subject: [PATCH 04/10] workspace Cargo.toml: pin tract-linalg patch to rev b82d1f0 (instead of branch) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pins the [patch.crates-io] override to a specific commit SHA rather than a branch, so libDF builds are reproducible while the upstream PR is under review. Any further commits we push to the branch (in response to review feedback) will not auto-propagate to libDF builds — we'll bump the rev deliberately when we want to absorb them. Pinned commit (b82d1f0): full kernel kit (4x1, 8x1, 16x1, 8x8 + per-M dispatcher), no 8x4, with module-level #![allow(unsafe_op_in_unsafe_fn)]. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 00270c1c8..0df6e4d78 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,6 @@ panic = "abort" # Source: https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit # Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue). [patch.crates-io] -tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" } +tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "b82d1f0" } From d089c96b7d7a6dd834a5060d394f7465827cad25 Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Tue, 28 Apr 2026 11:32:05 +0100 Subject: [PATCH 05/10] workspace Cargo.toml: bump tract-linalg patch rev to d925624 Picks up the upstream review fix (inner unsafe { } blocks per kali's request, plus cargo fmt). 
Rev d925624 is the new HEAD of czoli1976:add-wasm-f32-full-kernel-kit; the previous b82d1f0 was force-pushed away (rejected lint-allow approach). --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0df6e4d78..1640e31c6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,6 @@ panic = "abort" # Source: https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit # Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue). [patch.crates-io] -tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "b82d1f0" } +tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "d925624" } From d222c7b62e5095e1fd4cbc4b252924a9d9ecbbb0 Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:43:38 +0100 Subject: [PATCH 06/10] fix(deps): vendor tract-linalg-0.22.1 with kernel kit (path-based [patch.crates-io]) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the broken `[patch.crates-io] tract-linalg = { git = ..., rev = "d925624" }` directive (which silently fell back to stock crates.io tract-linalg-0.22.1 because the git ref was at version 0.23.0-pre — version mismatch with libDF's =0.22.1 constraint, so cargo ignored the patch). Replacement: vendor a self-contained single-crate copy of sonos/tract@v0.22.1's `linalg/` subcrate at `vendor/tract-linalg-0.22.1/`, with the kernel kit cherry-picks applied (sonos/tract#2164's 4 commits: 4x1 GEMV, kit extension, 8x4 drop, review-feedback inner-unsafe-blocks). The vendor crate is at version 0.22.1 (matches libDF's =0.22.1 requirement) and is single-crate (its tract-data dep comes from crates.io, avoiding the multi-workspace tract-data version conflict that caused the previous git+branch attempt to fail with 103 type-mismatch errors). 
Validated: - Cargo.lock now has ONE tract-linalg entry (path-resolved, 0.22.1) - WASM size: 10,007,237 bytes raw (was 9,971,007 with stock kernel-kit-free fallback — +36 KB confirms kernel kit symbols are linked) - Bench: RTF 0.0516, per-frame mean 0.5157 ms (matches prior kernel-kit best; was 0.108 with broken patch falling back to stock 4x4-only) Drop this vendor dir entirely once tract publishes a release including the merged kernel kit (sonos/tract#2164 was merged 2026-04-28; awaiting 0.22.x cherry-pick or 0.23.x cut). --- Cargo.toml | 7 +- vendor/tract-linalg-0.22.1/.cargo-ok | 1 + .../tract-linalg-0.22.1/.cargo_vcs_info.json | 7 + vendor/tract-linalg-0.22.1/Cargo.toml | 224 +++ vendor/tract-linalg-0.22.1/Cargo.toml.orig | 125 ++ vendor/tract-linalg-0.22.1/LICENSE | 12 + vendor/tract-linalg-0.22.1/LICENSE-APACHE | 201 ++ vendor/tract-linalg-0.22.1/LICENSE-MIT | 23 + vendor/tract-linalg-0.22.1/README.md | 27 + .../armv7neon_mmm_f32_32x1_core.tmpl | 204 ++ .../armv7neon/armv7neon_mmm_f32_8x1_core.tmpl | 98 + .../armv7neon/armv7neon_mmm_f32_8x4_core.tmpl | 143 ++ .../armv7neon/armv7neon_mmm_f32_8x6_core.tmpl | 158 ++ .../armv7neon_mmm_f32_per_cols.tmpliq | 9 + .../armv7neon_mmm_f32_per_rows.tmpliq | 9 + .../armv7neon_mmm_f32_scalars.tmpliq | 24 + .../armv7neon/armv7neon_mmm_i32_32x1.tmpl | 174 ++ .../armv7neon/armv7neon_mmm_i32_8x4.tmpl | 294 +++ .../armv7neon_mmm_i32_per_cols.tmpliq | 8 + .../armv7neon_mmm_i32_per_rows.tmpliq | 8 + .../armv7neon_mmm_i32_scalars.tmpliq | 20 + .../armv7neon_mmm_i32_scale_q8_q15.tmpliq | 232 +++ .../armv7neon/armv7neon_mmm_q_per_col.tmpliq | 33 + .../armv7neon/armv7neon_mmm_q_per_row.tmpliq | 24 + .../armv7neon/armv7neon_mmm_q_scalar.tmpliq | 15 + .../arm32/armv7neon/armv7neon_prefetch.tmpl | 22 + .../armv7neon/armv7neon_sigmoid_f32_4n.tmpl | 215 +++ .../armv7neon/armv7neon_tanh_f32_4n.tmpl | 209 ++ .../arm32/armv7neon/dispatcher.tmpliq | 38 + .../arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl | 491 +++++ 
.../arm32/armvfpv2/dispatcher.tmpliq | 32 + .../apple_amx/apple_amx_mmm_f16_64x1.tmpl | 533 ++++++ .../apple_amx/apple_amx_mmm_f16_64x32.tmpl | 658 +++++++ .../apple_amx/apple_amx_mmm_f32_32x1.tmpl | 533 ++++++ .../apple_amx/apple_amx_mmm_f32_32x32.tmpl | 764 ++++++++ .../arm64/apple_amx/dispatcher.tmpliq | 37 + .../arm64/apple_amx/instructions.rs | 191 ++ .../arm64fp16_leaky_relu_f16_8n.tmpl | 71 + .../arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq | 41 + .../arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq | 25 + .../arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq | 18 + .../loop1/cortex_a53.tmpli | 65 + .../arm64fp16_mmm_f16_128x1/loop1/naive.tmpli | 32 + .../loop2/cortex_a55.tmpli | 85 + .../arm64fp16_mmm_f16_128x1_core.tmpl | 203 ++ .../arm64fp16_mmm_f16_16x8/loop1/naive.tmpli | 21 + .../loop2/cortex_a55.tmpli | 54 + .../arm64fp16_mmm_f16_16x8_core.tmpl | 174 ++ .../arm64fp16_mmm_f16_32x4/loop1/naive.tmpli | 21 + .../loop2/cortex_a55.tmpli | 71 + .../arm64fp16_mmm_f16_32x4_core.tmpl | 165 ++ .../arm64fp16_mmm_f16_32x6.core.tmpl | 148 ++ .../arm64fp16_mmm_f16_64x1.core.tmpl | 264 +++ .../arm64fp16_mmm_f16_64x3.core.tmpl | 165 ++ .../arm64fp16_mmm_f16_per_cols.tmpliq | 9 + .../arm64fp16_mmm_f16_per_rows.tmpliq | 9 + .../arm64fp16_mmm_f16_scalars.tmpliq | 36 + .../arm64fp16/arm64fp16_mmm_load_tile.tmpliq | 10 + .../arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl | 131 ++ .../arm64fp16/arm64fp16_tanh_f16_8n.tmpl | 124 ++ .../arm64/arm64fp16/dispatcher.tmpliq | 37 + .../arm64/arm64fp16/dummy_fmla_no_pragma.S | 13 + .../arm64/arm64fp16/dummy_fmla_pragma.S | 13 + .../arm64simd/arm64simd_mmm_4s_per_col.tmpliq | 36 + .../arm64simd/arm64simd_mmm_4s_per_row.tmpliq | 25 + .../arm64simd/arm64simd_mmm_4s_scalar.tmpliq | 18 + .../ldr_w_no_preload.tmpli | 69 + .../packed_packed_loop1/ldr_w_preload.tmpli | 82 + .../packed_packed_loop1/ldr_x_preload.tmpli | 60 + .../packed_packed_loop1/naive.tmpli | 34 + .../packed_packed_loop2/cortex_a55.tmpli | 107 ++ .../arm64simd_mmm_f32_12x8_core.tmpl | 163 ++ 
.../packed_packed_loop1/cortex_a53.tmpli | 45 + .../packed_packed_loop1/naive.tmpli | 21 + .../packed_packed_loop2/cortex_a55.tmpli | 73 + .../arm64simd_mmm_f32_16x4_core.tmpl | 174 ++ .../loop2/cortex_a55.tmpli | 73 + .../packed_packed_loop1/cortex_a53.tmpli | 63 + .../packed_packed_loop1/cortex_a55.tmpli | 53 + .../packed_packed_loop1/naive.tmpli | 31 + .../arm64simd_mmm_f32_24x4_core.tmpl | 185 ++ .../arm64simd_mmm_f32_32x1_core.tmpl | 403 ++++ .../arm64simd_mmm_f32_32x3_core.tmpl | 307 +++ .../loop1/cortex_a53.tmpli | 65 + .../arm64simd_mmm_f32_64x1/loop1/naive.tmpli | 32 + .../loop2/cortex_a55.tmpli | 85 + .../arm64simd_mmm_f32_64x1/loop2/naive.tmpli | 66 + .../arm64simd_mmm_f32_64x1_core.tmpl | 225 +++ .../packed_packed_loop1/broken_chains.tmpli | 25 + .../ldr_w_no_preload.tmpli | 51 + .../packed_packed_loop1/ldr_w_preload.tmpli | 54 + .../ldr_x_no_preload.tmpli | 35 + .../packed_packed_loop1/ldr_x_preload.tmpli | 43 + .../packed_packed_loop1/naive.tmpli | 21 + .../packed_packed_loop2/broken_chains.tmpli | 41 + .../packed_packed_loop2/cortex_a55.tmpli | 60 + .../arm64simd/arm64simd_mmm_f32_8x8_core.tmpl | 182 ++ .../arm64simd_mmm_f32_per_cols.tmpliq | 9 + .../arm64simd_mmm_f32_per_rows.tmpliq | 9 + .../arm64simd_mmm_f32_scalars.tmpliq | 36 + .../arm64simd/arm64simd_mmm_i32_64x1.tmpl | 180 ++ .../arm64simd/arm64simd_mmm_i32_8x8.tmpl | 234 +++ .../arm64simd_mmm_i32_per_cols.tmpliq | 8 + .../arm64simd_mmm_i32_per_rows.tmpliq | 8 + .../arm64simd_mmm_i32_scalars.tmpliq | 31 + .../arm64simd_mmm_i32_scale_q16_q31.tmpliq | 267 +++ .../arm64simd/arm64simd_mmm_load_tile.tmpliq | 10 + .../arm64simd/arm64simd_sigmoid_f32_4n.tmpl | 206 ++ .../arm64simd/arm64simd_tanh_f32_4n.tmpl | 198 ++ .../arm64/arm64simd/dispatcher.tmpliq | 37 + .../tract-linalg-0.22.1/benches/arm32neon.rs | 179 ++ vendor/tract-linalg-0.22.1/benches/arm64.rs | 77 + .../tract-linalg-0.22.1/benches/arm64simd.rs | 926 +++++++++ vendor/tract-linalg-0.22.1/benches/intel.rs | 200 ++ 
.../tract-linalg-0.22.1/benches/leaky_relu.rs | 63 + vendor/tract-linalg-0.22.1/benches/mat_vec.rs | 46 + .../benches/mm_for_asr_am.rs | 37 + .../benches/mm_for_inception.rs | 45 + .../benches/mm_for_wavenet_hw.rs | 12 + vendor/tract-linalg-0.22.1/benches/sigmoid.rs | 22 + vendor/tract-linalg-0.22.1/benches/softmax.rs | 110 ++ vendor/tract-linalg-0.22.1/benches/utils.rs | 92 + .../benches/virtual_im2col.rs | 47 + vendor/tract-linalg-0.22.1/benches/x86_64.rs | 242 +++ vendor/tract-linalg-0.22.1/build.rs | 374 ++++ vendor/tract-linalg-0.22.1/src/arm32.rs | 101 + .../src/arm32/armv7neon.rs | 46 + .../tract-linalg-0.22.1/src/arm32/armvfpv2.rs | 11 + .../src/arm32/cortex_a7.rs | 16 + .../src/arm32/cortex_a7.txt | 1701 +++++++++++++++++ .../src/arm32/cortex_a9.rs | 16 + .../src/arm32/cortex_a9.txt | 1701 +++++++++++++++++ vendor/tract-linalg-0.22.1/src/arm64.rs | 383 ++++ .../src/arm64/apple_amx.rs | 32 + .../src/arm64/arm64fp16.rs | 64 + .../src/arm64/arm64fp16/by_scalar.rs | 258 +++ .../src/arm64/arm64fp16/leaky_relu.rs | 56 + .../src/arm64/arm64fp16/max.rs | 63 + .../src/arm64/arm64fp16/panel_extract.rs | 94 + .../src/arm64/arm64fp16/sum.rs | 62 + .../src/arm64/arm64fp16/unicast.rs | 271 +++ .../src/arm64/arm64simd.rs | 117 ++ .../src/arm64/arm64simd/by_scalar.rs | 202 ++ .../src/arm64/arm64simd/leaky_relu.rs | 50 + .../src/arm64/arm64simd/max.rs | 52 + .../src/arm64/arm64simd/panel_extract.rs | 98 + .../src/arm64/arm64simd/softmax.rs | 110 ++ .../src/arm64/arm64simd/sum.rs | 59 + .../src/arm64/arm64simd/unicast.rs | 233 +++ .../src/arm64/cortex_a53.rs | 16 + .../src/arm64/cortex_a55.rs | 16 + .../src/arm64/cortex_a72.rs | 4 + .../src/arm64/cortex_a73.rs | 4 + .../src/frame/block_quant/helpers.rs | 65 + .../src/frame/block_quant/mod.rs | 327 ++++ .../src/frame/block_quant/q4_0.rs | 509 +++++ .../src/frame/block_quant/value.rs | 116 ++ .../src/frame/by_scalar.rs | 96 + .../src/frame/element_wise.rs | 165 ++ .../src/frame/element_wise_helper.rs | 169 ++ 
.../src/frame/leaky_relu.rs | 65 + vendor/tract-linalg-0.22.1/src/frame/lut.rs | 141 ++ .../src/frame/mmm/cost_model.rs | 86 + .../tract-linalg-0.22.1/src/frame/mmm/fuse.rs | 125 ++ .../src/frame/mmm/input_store.rs | 179 ++ .../src/frame/mmm/kernel.rs | 159 ++ .../src/frame/mmm/macros.rs | 124 ++ .../tract-linalg-0.22.1/src/frame/mmm/mod.rs | 307 +++ .../src/frame/mmm/panel_extract.rs | 300 +++ .../src/frame/mmm/scratch.rs | 529 +++++ .../src/frame/mmm/storage.rs | 139 ++ .../src/frame/mmm/tests/frame.rs | 295 +++ .../src/frame/mmm/tests/fuse.rs | 287 +++ .../src/frame/mmm/tests/mod.rs | 89 + .../src/frame/mmm/tests/packed_packed.rs | 382 ++++ .../src/frame/mmm/tests/q_scale.rs | 176 ++ .../src/frame/mmm/tests/store.rs | 131 ++ vendor/tract-linalg-0.22.1/src/frame/mod.rs | 25 + vendor/tract-linalg-0.22.1/src/frame/pack.rs | 1015 ++++++++++ .../src/frame/reduce/max.rs | 42 + .../src/frame/reduce/mod.rs | 300 +++ .../src/frame/reduce/softmax.rs | 86 + .../src/frame/reduce/sum.rs | 54 + .../tract-linalg-0.22.1/src/frame/sigmoid.rs | 96 + vendor/tract-linalg-0.22.1/src/frame/tanh.rs | 101 + .../tract-linalg-0.22.1/src/frame/unicast.rs | 233 +++ .../tract-linalg-0.22.1/src/frame/weights.rs | 80 + vendor/tract-linalg-0.22.1/src/generic.rs | 55 + .../src/generic/by_scalar.rs | 181 ++ vendor/tract-linalg-0.22.1/src/generic/erf.rs | 51 + .../src/generic/leaky_relu.rs | 74 + vendor/tract-linalg-0.22.1/src/generic/lut.rs | 47 + vendor/tract-linalg-0.22.1/src/generic/mmm.rs | 453 +++++ .../tract-linalg-0.22.1/src/generic/reduce.rs | 187 ++ .../src/generic/rounding.rs | 524 +++++ .../src/generic/sigmoid.rs | 138 ++ .../tract-linalg-0.22.1/src/generic/tanh.rs | 133 ++ .../src/generic/unicast.rs | 194 ++ .../src/hwbench/bandwidth.rs | 159 ++ vendor/tract-linalg-0.22.1/src/hwbench/mod.rs | 4 + .../tract-linalg-0.22.1/src/hwbench/runner.rs | 122 ++ vendor/tract-linalg-0.22.1/src/lib.rs | 404 ++++ vendor/tract-linalg-0.22.1/src/multithread.rs | 57 + 
vendor/tract-linalg-0.22.1/src/wasm.rs | 1664 ++++++++++++++++ .../src/wasm.rs.before-fma | 1664 ++++++++++++++++ .../tract-linalg-0.22.1/src/wasm.rs.with-8x4 | 1555 +++++++++++++++ vendor/tract-linalg-0.22.1/src/x86_64_fma.rs | 49 + .../src/x86_64_fma/by_scalar.rs | 56 + .../src/x86_64_fma/intel.rs | 5 + .../tract-linalg-0.22.1/src/x86_64_fma/max.rs | 67 + .../tract-linalg-0.22.1/src/x86_64_fma/mmm.rs | 172 ++ .../src/x86_64_fma/panel_extract.rs | 136 ++ .../src/x86_64_fma/softmax.rs | 121 ++ .../tests/virtual_im2col.rs | 545 ++++++ .../packed_packed_loop1/avx-512-unroll.tmpli | 59 + .../10x1/packed_packed_loop1/avx-512.tmpli | 33 + .../1x1/packed_packed_loop1/avx-512.tmpli | 7 + .../1x1/packed_packed_loop1/unroll-16.tmpli | 68 + .../1x1/packed_packed_loop1/unroll-4.tmpli | 24 + .../1x1/packed_packed_loop1/unroll-8.tmpli | 29 + .../1x1/packed_packed_loop1/unroll.tmpli | 11 + .../1x12/packed_packed_loop1/avx-512.tmpli | 45 + .../packed_packed_loop1/avx-512-unroll.tmpli | 53 + .../2x5/packed_packed_loop1/avx-512.tmpli | 30 + .../packed_packed_loop1/avx-512-unroll.tmpli | 71 + .../2x6/packed_packed_loop1/avx-512.tmpli | 39 + .../packed_packed_loop1/avx-512-unroll.tmpli | 63 + .../3x4/packed_packed_loop1/avx-512.tmpli | 35 + .../packed_packed_loop1/avx-512-unroll.tmpli | 69 + .../4x3/packed_packed_loop1/avx-512.tmpli | 38 + .../packed_packed_loop1/avx-512-unroll.tmpli | 63 + .../5x2/packed_packed_loop1/avx-512.tmpli | 34 + .../packed_packed_loop1/avx-512-unroll.tmpli | 25 + .../6x1/packed_packed_loop1/avx-512.tmpli | 29 + .../packed_packed_loop1/avx-512-unroll.tmpli | 70 + .../6x2/packed_packed_loop1/avx-512.tmpli | 38 + .../packed_packed_loop1/avx-512-unroll.tmpli | 40 + .../7x1/packed_packed_loop1/avx-512.tmpli | 21 + .../packed_packed_loop1/avx-512-unroll.tmpli | 30 + .../8x1/packed_packed_loop1/avx-512.tmpli | 25 + .../8x2/packed_packed_loop1/avx-512.tmpli | 42 + .../packed_packed_loop1/avx-512-unroll.tmpli | 61 + .../8x8/packed_packed_loop1/avx-512.tmpli | 33 + 
.../x86_64/avx512/avx512_mmm_f32_128x1.tmpl | 110 ++ .../x86_64/avx512/avx512_mmm_f32_16x1.tmpl | 143 ++ .../x86_64/avx512/avx512_mmm_f32_16x12.tmpl | 165 ++ .../x86_64/avx512/avx512_mmm_f32_16x8.tmpl | 143 ++ .../x86_64/avx512/avx512_mmm_f32_32x5.tmpl | 144 ++ .../x86_64/avx512/avx512_mmm_f32_32x6.tmpl | 161 ++ .../x86_64/avx512/avx512_mmm_f32_48x4.tmpl | 148 ++ .../x86_64/avx512/avx512_mmm_f32_64x3.tmpl | 149 ++ .../x86_64/avx512/avx512_mmm_f32_80x2.tmpl | 148 ++ .../x86_64/avx512/avx512_mmm_load_tile.tmpliq | 9 + .../x86_64/avx512/dispatcher.tmpliq | 40 + .../x86_64/avx512/f32_per_cols.tmpliq | 8 + .../x86_64/avx512/f32_per_rows.tmpliq | 8 + .../x86_64/avx512/f32_scalars.tmpliq | 29 + .../x86_64/avx512/i32_per_cols.tmpliq | 8 + .../x86_64/avx512/i32_per_rows.tmpliq | 8 + .../x86_64/avx512/i32_scalars.tmpliq | 10 + .../x86_64/avx512/postamble.tmpliq | 38 + .../x86_64/avx512/preamble.tmpliq | 63 + .../x86_64/avx512/sigmoid_f32.tmpl | 324 ++++ .../x86_64/avx512/tanh_f32.tmpl | 313 +++ .../x86_64/avx512/zmm_per_col.tmpliq | 29 + .../x86_64/avx512/zmm_per_row.tmpliq | 23 + .../x86_64/avx512/zmm_scalar.tmpliq | 15 + .../10x1/packed_packed_loop1/avx-unroll.tmpli | 58 + .../fma/10x1/packed_packed_loop1/avx.tmpli | 33 + .../2x5/packed_packed_loop1/avx-unroll.tmpli | 52 + .../fma/2x5/packed_packed_loop1/avx.tmpli | 30 + .../packed_packed_loop1/original-unroll.tmpli | 71 + .../2x6/packed_packed_loop1/original.tmpli | 39 + .../3x4/packed_packed_loop1/avx-unroll.tmpli | 60 + .../fma/3x4/packed_packed_loop1/avx.tmpli | 32 + .../4x3/packed_packed_loop1/avx-unroll.tmpli | 69 + .../fma/4x3/packed_packed_loop1/avx.tmpli | 38 + .../5x2/packed_packed_loop1/avx-unroll.tmpli | 63 + .../fma/5x2/packed_packed_loop1/avx.tmpli | 34 + .../6x1/packed_packed_loop1/avx-unroll.tmpli | 25 + .../fma/6x1/packed_packed_loop1/avx.tmpli | 29 + .../6x2/packed_packed_loop1/avx-unroll.tmpli | 70 + .../fma/6x2/packed_packed_loop1/avx.tmpli | 38 + .../7x1/packed_packed_loop1/avx-unroll.tmpli | 37 + 
.../fma/7x1/packed_packed_loop1/avx.tmpli | 22 + .../8x1/packed_packed_loop1/avx-unroll.tmpli | 48 + .../fma/8x1/packed_packed_loop1/avx.tmpli | 33 + .../8x8/packed_packed_loop1/avx-unroll.tmpli | 58 + .../fma/8x8/packed_packed_loop1/avx.tmpli | 30 + .../x86_64/fma/avx2_mmm_i32_8x8.tmpl | 682 +++++++ .../x86_64/fma/dispatcher.tmpliq | 40 + .../x86_64/fma/fma_mmm_f32_16x5.tmpl | 143 ++ .../x86_64/fma/fma_mmm_f32_16x6.tmpl | 131 ++ .../x86_64/fma/fma_mmm_f32_24x4.tmpl | 158 ++ .../x86_64/fma/fma_mmm_f32_32x1.tmpl | 368 ++++ .../x86_64/fma/fma_mmm_f32_32x3.tmpl | 239 +++ .../x86_64/fma/fma_mmm_f32_40x2.tmpl | 158 ++ .../x86_64/fma/fma_mmm_f32_64x1.tmpl | 142 ++ .../x86_64/fma/fma_mmm_f32_8x8.tmpl | 129 ++ .../x86_64/fma/fma_mmm_f32_per_cols.tmpliq | 9 + .../x86_64/fma/fma_mmm_f32_per_rows.tmpliq | 9 + .../x86_64/fma/fma_mmm_f32_scalars.tmpliq | 38 + .../x86_64/fma/fma_mmm_i32_per_cols.tmpliq | 9 + .../x86_64/fma/fma_mmm_i32_per_rows.tmpliq | 9 + .../x86_64/fma/fma_mmm_i32_scalars.tmpliq | 23 + .../x86_64/fma/fma_mmm_load_tile.tmpliq | 9 + .../x86_64/fma/fma_mmm_ymm_per_col.tmpliq | 35 + .../x86_64/fma/fma_mmm_ymm_per_row.tmpliq | 32 + .../x86_64/fma/fma_mmm_ymm_scalar.tmpliq | 22 + .../x86_64/fma/fma_sigmoid_f32.tmpl | 319 ++++ .../x86_64/fma/fma_tanh_f32.tmpl | 313 +++ .../x86_64/fma/postamble.tmpliq | 38 + .../x86_64/fma/preamble.tmpliq | 64 + 313 files changed, 43672 insertions(+), 3 deletions(-) create mode 100644 vendor/tract-linalg-0.22.1/.cargo-ok create mode 100644 vendor/tract-linalg-0.22.1/.cargo_vcs_info.json create mode 100644 vendor/tract-linalg-0.22.1/Cargo.toml create mode 100644 vendor/tract-linalg-0.22.1/Cargo.toml.orig create mode 100644 vendor/tract-linalg-0.22.1/LICENSE create mode 100644 vendor/tract-linalg-0.22.1/LICENSE-APACHE create mode 100644 vendor/tract-linalg-0.22.1/LICENSE-MIT create mode 100644 vendor/tract-linalg-0.22.1/README.md create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl create mode 
100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq create mode 100644 
vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl create mode 100644 
vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli create mode 100644 
vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli create mode 100644 
vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl create mode 100644 
vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/benches/arm32neon.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/arm64.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/arm64simd.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/intel.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/leaky_relu.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/mat_vec.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/sigmoid.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/softmax.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/utils.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs create mode 100644 vendor/tract-linalg-0.22.1/benches/x86_64.rs create mode 100644 vendor/tract-linalg-0.22.1/build.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm32.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt create mode 100644 vendor/tract-linalg-0.22.1/src/arm64.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs create mode 100644 
vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/element_wise.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/lut.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs create mode 100644 
vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mod.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/pack.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/tanh.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/unicast.rs create mode 100644 vendor/tract-linalg-0.22.1/src/frame/weights.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/erf.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/lut.rs create mode 100644 
vendor/tract-linalg-0.22.1/src/generic/mmm.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/reduce.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/rounding.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/tanh.rs create mode 100644 vendor/tract-linalg-0.22.1/src/generic/unicast.rs create mode 100644 vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs create mode 100644 vendor/tract-linalg-0.22.1/src/hwbench/mod.rs create mode 100644 vendor/tract-linalg-0.22.1/src/hwbench/runner.rs create mode 100644 vendor/tract-linalg-0.22.1/src/lib.rs create mode 100644 vendor/tract-linalg-0.22.1/src/multithread.rs create mode 100644 vendor/tract-linalg-0.22.1/src/wasm.rs create mode 100644 vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma create mode 100644 vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma.rs create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs create mode 100644 vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli create mode 100644 
vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli create mode 100644 
vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq create mode 100644 
vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli create mode 100644 
vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq create mode 100644 
vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq diff --git a/Cargo.toml b/Cargo.toml index 1640e31c6..3702d226b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,9 +30,10 @@ panic = "abort" # wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV variants) # + wasm_f32_8x8 (new MM variant) + per-M dispatcher in Ops::mmv_f32. # Cumulative impact on DFN3: RTF 0.1290 -> 0.0516 (-60%, 2.5x faster), bit-identical audio. -# Source: https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit -# Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue). +# Source: vendor/tract-linalg-0.22.1 (self-contained crate, version-matched 0.22.1). +# Lineage: cherry-picked from czoli1976/tract@kernel-kit-on-v0.22.1 — kernel kit +# from sonos/tract#2164 (merged) applied onto sonos/tract v0.22.1 tag. 
[patch.crates-io] -tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "d925624" } +tract-linalg = { path = "vendor/tract-linalg-0.22.1" } diff --git a/vendor/tract-linalg-0.22.1/.cargo-ok b/vendor/tract-linalg-0.22.1/.cargo-ok new file mode 100644 index 000000000..5f8b79583 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/vendor/tract-linalg-0.22.1/.cargo_vcs_info.json b/vendor/tract-linalg-0.22.1/.cargo_vcs_info.json new file mode 100644 index 000000000..9f37214c9 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/.cargo_vcs_info.json @@ -0,0 +1,7 @@ +{ + "git": { + "sha1": "88246b2a7e5b55df558a828b9c0c4815590620ce", + "dirty": true + }, + "path_in_vcs": "linalg" +} \ No newline at end of file diff --git a/vendor/tract-linalg-0.22.1/Cargo.toml b/vendor/tract-linalg-0.22.1/Cargo.toml new file mode 100644 index 000000000..d678cb319 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/Cargo.toml @@ -0,0 +1,224 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2024" +name = "tract-linalg" +version = "0.22.1" +authors = ["Mathieu Poumeyrol "] +build = "build.rs" +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "Tiny, no-nonsense, self contained, TensorFlow and ONNX inference" +readme = "README.md" +keywords = [ + "TensorFlow", + "NeuralNetworks", +] +categories = ["science"] +license = "MIT OR Apache-2.0" +repository = "https://github.com/snipsco/tract" +resolver = "2" + +[badges.maintenance] +status = "actively-developed" + +[features] +apple-amx-ios = [] +complex = ["tract-data/complex"] +default = [] +hwbench = ["rayon"] +multithread-mm = ["rayon"] +no_fp16 = [] + +[lib] +name = "tract_linalg" +path = "src/lib.rs" + +[[test]] +name = "virtual_im2col" +path = "tests/virtual_im2col.rs" + +[[bench]] +name = "arm32neon" +path = "benches/arm32neon.rs" +bench = false +harness = false + +[[bench]] +name = "arm64" +path = "benches/arm64.rs" +bench = false +harness = false + +[[bench]] +name = "arm64simd" +path = "benches/arm64simd.rs" +bench = false +harness = false + +[[bench]] +name = "intel" +path = "benches/intel.rs" +bench = false +harness = false + +[[bench]] +name = "leaky_relu" +path = "benches/leaky_relu.rs" +bench = false +harness = false + +[[bench]] +name = "mat_vec" +path = "benches/mat_vec.rs" +harness = false + +[[bench]] +name = "mm_for_asr_am" +path = "benches/mm_for_asr_am.rs" +harness = false + +[[bench]] +name = "mm_for_inception" +path = "benches/mm_for_inception.rs" +harness = false + +[[bench]] +name = "mm_for_wavenet_hw" +path = "benches/mm_for_wavenet_hw.rs" +harness = false + +[[bench]] +name = "sigmoid" +path = "benches/sigmoid.rs" +harness = false + +[[bench]] +name = "softmax" +path = "benches/softmax.rs" +harness = false + +[[bench]] +name = "virtual_im2col" +path = "benches/virtual_im2col.rs" +harness = false + +[[bench]] +name = "x86_64" +path = "benches/x86_64.rs" +bench = false +harness = false + 
+[dependencies.byteorder] +version = "1.4.3" + +[dependencies.derive-new] +version = "0.5.9" + +[dependencies.downcast-rs] +version = "1.2.0" + +[dependencies.dyn-clone] +version = "1.0.4" + +[dependencies.dyn-hash] +version = "0.2" + +[dependencies.lazy_static] +version = "1.5.0" + +[dependencies.log] +version = "0.4.14" + +[dependencies.num-traits] +version = "0.2.14" + +[dependencies.pastey] +version = "0.1" + +[dependencies.rayon] +version = "1.10" +optional = true + +[dependencies.scan_fmt] +version = "0.2.6" + +[dependencies.tract-data] +version = "=0.22.1" + +[dev-dependencies.core_affinity] +version = "0.8.0" + +[dev-dependencies.env_logger] +version = "0.10" + +[dev-dependencies.libc] +version = "0.2.164" + +[dev-dependencies.nu-ansi-term] +version = "0.46" + +[build-dependencies.cc] +version = "1.0.69" + +[build-dependencies.half] +version = ">=2.4,<3.0" +features = [ + "std", + "num-traits", +] + +[build-dependencies.liquid] +version = "0.26.8" + +[build-dependencies.liquid-core] +version = "0.26.8" + +[build-dependencies.liquid-derive] +version = "0.26.8" + +[build-dependencies.smallvec] +version = "1.6.1" + +[build-dependencies.time] +version = "0.3.23" + +[build-dependencies.unicode-normalization] +version = "0.1.19" + +[build-dependencies.walkdir] +version = "2.3.2" + +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies.criterion] +version = "0.6" + +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies.proptest] +version = "1.0.0" + +[target.'cfg(target_family = "wasm")'.dev-dependencies.criterion] +version = "0.6" +features = [ + "plotters", + "cargo_bench_support", +] +default-features = false + +[target.'cfg(target_family = "wasm")'.dev-dependencies.proptest] +version = "1.0.0" +features = [ + "std", + "bit-set", +] +default-features = false diff --git a/vendor/tract-linalg-0.22.1/Cargo.toml.orig b/vendor/tract-linalg-0.22.1/Cargo.toml.orig new file mode 100644 index 000000000..5e7551008 --- /dev/null +++ 
b/vendor/tract-linalg-0.22.1/Cargo.toml.orig @@ -0,0 +1,125 @@ +[package] +name = "tract-linalg" +version = "0.22.1" +license = "MIT OR Apache-2.0" +authors = ["Mathieu Poumeyrol "] +description = "Tiny, no-nonsense, self contained, TensorFlow and ONNX inference" +repository = "https://github.com/snipsco/tract" +keywords = ["TensorFlow", "NeuralNetworks"] +categories = ["science"] +autobenches = false +edition = "2024" + +[badges] +maintenance = { status = "actively-developed" } + +[dependencies] +byteorder.workspace = true +derive-new.workspace = true +downcast-rs.workspace = true +dyn-clone.workspace = true +dyn-hash.workspace = true +lazy_static.workspace = true +log.workspace = true +num-traits.workspace = true +pastey.workspace = true +rayon = { workspace = true, optional = true } +scan_fmt.workspace = true +tract-data.workspace = true + +[build-dependencies] +cc.workspace = true +half.workspace = true +liquid.workspace = true +liquid-core.workspace = true +liquid-derive.workspace = true +smallvec.workspace = true +unicode-normalization.workspace = true +time.workspace = true +walkdir.workspace = true + +[dev-dependencies] +env_logger.workspace = true +libc.workspace = true +nu-ansi-term.workspace = true +core_affinity.workspace = true + +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies] +criterion.workspace = true +proptest.workspace = true + +[target.'cfg(target_family = "wasm")'.dev-dependencies] +# Wasm doesn't support the `rayon` feature of criterion +criterion = { version = "0.6", default-features = false, features = ["plotters", "cargo_bench_support"] } +# Wasm doesn't support the `fork` feature of proptest. +proptest = { version = "1.0.0", default-features = false, features = ["std", "bit-set"] } + +[features] +# This feature is meant to accommodate very restrictive / legacy toolchains that do +# not have support for fp16 instructions, breaking tract compilation. 
+# It is not meant to be used in other situations, where run-time detection is +# preferred. +no_fp16 = [] +apple-amx-ios = [] +default = [ ] +multithread-mm = [ "rayon" ] +complex = [ "tract-data/complex" ] +hwbench = [ "rayon" ] + +[[bench]] +bench = false +name = "arm64" +harness = false + +[[bench]] +name = "mat_vec" +harness = false + +[[bench]] +name = "mm_for_wavenet_hw" +harness = false + +[[bench]] +name = "mm_for_inception" +harness = false + +[[bench]] +name = "mm_for_asr_am" +harness = false + +[[bench]] +name = "sigmoid" +harness = false + +[[bench]] +name = "softmax" +harness = false + +[[bench]] +bench = false +name = "arm64simd" +harness = false + +[[bench]] +bench = false +name = "arm32neon" +harness = false + +[[bench]] +name = "virtual_im2col" +harness = false + +[[bench]] +bench = false +name = "x86_64" +harness = false + +[[bench]] +bench = false +name = "intel" +harness = false + +[[bench]] +bench = false +name = "leaky_relu" +harness = false diff --git a/vendor/tract-linalg-0.22.1/LICENSE b/vendor/tract-linalg-0.22.1/LICENSE new file mode 100644 index 000000000..09250ca89 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/LICENSE @@ -0,0 +1,12 @@ +## License + +Licensed under either of + * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +at your option. + +### Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted +for inclusion in the work by you, as defined in the Apache-2.0 license, shall +be dual licensed as above, without any additional terms or conditions. 
diff --git a/vendor/tract-linalg-0.22.1/LICENSE-APACHE b/vendor/tract-linalg-0.22.1/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, 
in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/tract-linalg-0.22.1/LICENSE-MIT b/vendor/tract-linalg-0.22.1/LICENSE-MIT new file mode 100644 index 000000000..31aa79387 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/vendor/tract-linalg-0.22.1/README.md b/vendor/tract-linalg-0.22.1/README.md new file mode 100644 index 000000000..ba7b722b0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/README.md @@ -0,0 +1,27 @@ +# tract-linalg + +linalg stands for "linear algebra". This is a misnomer. This crate contains +low-level, architecture dependent optimisations used by tract-core. + +# Functions + +* MatMatMul: Extended matrix*matrix product: + * inspired by GotoBLAS and BLIS micro kernel approach + * extended for convolution friendly addressing (fused img2col) + * fused output pipeline (min, max, and a few more simple, fast ops) + * f32*f32 -> f32 (à la sgemm) + * i8*i8 -> i32 accumulator -> i32 storage + * i8*i8 -> i32 accumulator -> i8 (with channel zeropoint and scale, and re-quantization pipeline) +* f32 sigmoid and f32 tanh: at f32 precision, by a rational function (no exponentiation) +* byte-to-byte lookup table + +# Implementations + +| | generic fallback | armv6, vfp | armv7 neon | armv8 simd | x64 FMA +|-------------------|--------------------|---------------|-------------------|-------------------|----------------- +| MatMatMul f32 | | 4x4 | 8x4 | 8x8 | 16x6 +| MatMatMul i8->i8 | | | 8x4 | | 8x8 +| MatMatMul i8->i32 | | | | | 8x8 +| sigmoid f32 | | | 4n | 4n | +| tanh f32 | | | 4n | 4n | +| byte lookup | | | | | diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl new file mode 100644
index 000000000..d7b572e7b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl @@ -0,0 +1,204 @@ +// vim: ft=arm + +// C tile regs +// +// q8[0] +// q8[1] +// q8[2] +// q8[3] +// +// .... +// +// q15[0] +// q15[1] +// q15[2] +// q15[3] + + .arm + .text + .global armv7neon_mmm_f32_32x1_{{core}}_{{suffix}} + .type armv7neon_mmm_f32_32x1_{{core}}_{{suffix}}, %function + +armv7neon_mmm_f32_32x1_{{core}}_{{suffix}}: + + pld [r0] + push { r4-r12 } + vpush { q4-q7 } + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] + pld [r5] + + pld [r1, #128] + pld [r1, #192] + pld [r1, #256] + pld [r1, #320] + pld [r1, #384] + pld [r1, #448] + pld [r1, #512] + +.packed_packed_loop_1: + pld [r5] // packed B ptr + +{% if core == "cortexa7" %} + + vldr d0, [r1] + vldr d1, [r1, #8] + vldr d2, [r1, #16] + vldr d3, [r1, #24] + vldr d4, [r1, #32] + vldr d5, [r1, #40] + vldr d6, [r1, #48] + vldr d7, [r1, #56] + vldr d8, [r1, #64] + vldr d9, [r1, #72] + vldr d10, [r1, #80] + vldr d11, [r1, #88] + vldr s30, [r5] + + pld [r1, #512] + pld [r1, #576] + pld [r5, #64] + + vmla.f32 q8, q0, d15[0] + vmla.f32 q9, q1, d15[0] + + vldr d0, [r1, #96] + vldr d1, [r1, #104] + vldr d2, [r1, #112] + vldr d3, [r1, #120] + + vmla.f32 q10, q2, d15[0] + vmla.f32 q11, q3, d15[0] + + vmla.f32 q12, q4, d15[0] + vmla.f32 q13, q5, d15[0] + + vmla.f32 q14, q0, d15[0] + vmla.f32 q15, q1, d15[0] + + add r1, #128 + add r5, #4 + +{% elsif core == "cortexa9" %} + + vld1.64 {d0-d3}, [r1]! + vld1.64 {d4-d7}, [r1]! + pld [r1, #512] + pld [r1, #576] + vld1.64 {d8-d11}, [r1]! + vld1.f32 d15[0], [r5]! + pld [r5, #64] + + vmla.f32 q8, q0, d15[0] + vmla.f32 q9, q1, d15[0] + vld1.64 {d0-d3}, [r1]! 
+ + vmla.f32 q10, q2, d15[0] + vmla.f32 q11, q3, d15[0] + + vmla.f32 q12, q4, d15[0] + vmla.f32 q13, q5, d15[0] + + vmla.f32 q14, q0, d15[0] + vmla.f32 q15, q1, d15[0] + +{% else %} + + vldmia r1!, { q0-q3 } + vldmia r5!, { s30 } + + vmla.f32 q8, q0, d15[0] + vmla.f32 q9, q1, d15[0] + vldmia r1!, { q0-q1 } + + vmla.f32 q10, q2, d15[0] + vmla.f32 q11, q3, d15[0] + vldmia r1!, { q2-q3 } + + vmla.f32 q12, q0, d15[0] + vmla.f32 q13, q1, d15[0] + + vmla.f32 q14, q2, d15[0] + vmla.f32 q15, q3, d15[0] + +{% endif %} + + subs r3, r3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:15 %} +{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:32, from:8, to:15 %} +{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:32, from:8, to:15 %} + +.add_unicast: + {% for reg in (0..15) %} + vld1.f32 d{{reg}}[0], [ r3 ], r4 + vld1.f32 d{{reg}}[1], [ r3 ], r4 + {% endfor %} + {% for reg in (0..7) %} + vadd.f32 q{{reg|plus:8}}, q{{reg|plus:8}}, q{{reg}} + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + vld1.f32 d0[0], [ r4 ] + vldmia r3!, { q4-q7 } + + vmla.f32 q8, q4, d0[0] + vmla.f32 q9, q5, d0[0] + + vmla.f32 q10, q6, d0[0] + vmla.f32 q11, q7, d0[0] + + vldmia r3!, { q4-q7 } + + vmla.f32 q12, q4, d0[0] + vmla.f32 q13, q5, d0[0] + + vmla.f32 q14, q6, d0[0] + vmla.f32 q15, q7, d0[0] + + b .non_linear_loop + +.store: + // r3, r4 <- ptr, rsc + cmp r4, #4 + bne .store_generic + + vst1.f64 {d16-d19}, [r3]! + vst1.f64 {d20-d23}, [r3]! + vst1.f64 {d24-d27}, [r3]! + vst1.f64 {d28-d31}, [r3]! 
+ + b .non_linear_loop + +.store_generic: + + {% for reg in (16..31) %} + vst1.f32 d{{reg}}[0], [r3], r4 + vst1.f32 d{{reg}}[1], [r3], r4 + {% endfor %} + + b .non_linear_loop + +.load_tile: + vldmia r3!, { q8-q15 } + b .non_linear_loop + +.return: + vpop { q4-q7 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl new file mode 100644 index 000000000..93aa6f295 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl @@ -0,0 +1,98 @@ +// vim: ft=arm + + .arm + .text + .global armv7neon_mmm_f32_8x1_{{core}}_{{suffix}} + .type armv7neon_mmm_f32_8x1_{{core}}_{{suffix}}, %function + +armv7neon_mmm_f32_8x1_{{core}}_{{suffix}}: + + pld [r0] + push { r4-r12 } + vpush { q4-q7 } + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] + pld [r5] + + pld [r1, #128] + pld [r1, #192] + pld [r1, #256] + pld [r1, #320] + pld [r1, #384] + pld [r1, #448] + pld [r1, #512] + +.packed_packed_loop_1: + pld [r5] // packed B ptr + + vldmia r1!, { q0-q1 } + vldmia r5!, { s30 } + + vmla.f32 q8, q0, d15[0] + vmla.f32 q9, q1, d15[0] + + subs r3, r3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:9 %} +{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:8, to:9 %} +{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:8, to:9 %} + +.add_unicast: // 8x1 tile: gather 8 strided f32 (ptr r3, stride r4) into q0-q1, add into q8-q9 + {% for reg in (0..3) %} + vld1.f32 d{{reg}}[0], [ r3 ], r4 + vld1.f32 d{{reg}}[1], [ r3 ], r4 + {% endfor %} + {% for reg in (0..1) %} + vadd.f32 q{{reg|plus:8}}, q{{reg|plus:8}}, q{{reg}} + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + vld1.f32 d0[0], [ r4 ] + vldmia r3!, { q4-q5 } + + vmla.f32 q8, q4, d0[0] + vmla.f32 q9, q5, d0[0] + + b .non_linear_loop + +.store: + // r3, r4 <- ptr, rsc + cmp r4,
#4 + bne .store_generic + + vst1.f64 {d16-d19}, [r3]! + + b .non_linear_loop + +.store_generic: + + {% for reg in (16..19) %} + vst1.f32 d{{reg}}[0], [r3], r4 + vst1.f32 d{{reg}}[1], [r3], r4 + {% endfor %} + + b .non_linear_loop + +.load_tile: + vldmia r3!, { q8-q9 } // the tile is 8 f32: only q8-q9 are accumulators in this kernel + b .non_linear_loop + +.return: + vpop { q4-q7 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl new file mode 100644 index 000000000..9117e6aa6 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl @@ -0,0 +1,143 @@ +// vim: ft=arm + +// C tile regs +// +// q8[0] q10[0] q12[0] q14[0] +// q8[1] q10[1] q12[1] q14[1] +// q8[2] q10[2] q12[2] q14[2] +// q8[3] q10[3] q12[3] q14[3] +// +// q9[0] q11[0] q13[0] q15[0] +// q9[1] q11[1] q13[1] q15[1] +// q9[2] q11[2] q13[2] q15[2] +// q9[3] q11[3] q13[3] q15[3] + +// packed A buffering (2x8 values): alternating q0, q1 with q2, q3 +// packed B buffering (2x4 values): alternating q4 with q5 + + .arm + .text + .global armv7neon_mmm_f32_8x4_{{core}}_{{suffix}} + .type armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}, %function + +armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}: + pld [r0] + push { r4-r12 } + vpush { q4-q7 } + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] + pld [r5] + + .packed_packed: + pld [r5] // packed B ptr + .packed_packed_loop_1: + +{% if core == "cortexa7" %} + vldr d0, [r1] + vldr d1, [r1, #8] + vldr d2, [r1, #16] + vldr d3, [r1, #24] + vldr d4, [r5] + vldr d5, [r5, #8] +{% elsif core == "cortexa9" %} + vld1.64 {d0-d3}, [r1]! + vld1.64 {d4, d5}, [r5]!
+{% else %} + vldmia r1!, { q0, q1} + vldmia r5!, { q2 } +{% endif %} + +{% if core != "generic" %} + pld [r1, #512] + pld [r5, #512] +{% endif %} + + vmla.f32 q8, q0, d4[0] + vmla.f32 q9, q1, d4[0] + + vmla.f32 q10, q0, d4[1] + vmla.f32 q11, q1, d4[1] + + vmla.f32 q12, q0, d5[0] + vmla.f32 q13, q1, d5[0] + + vmla.f32 q14, q0, d5[1] + vmla.f32 q15, q1, d5[1] + +{% if core == "cortexa7" %} + add r1, #32 + add r5, #16 +{% endif %} + + subs r3, r3, #1 + bne .packed_packed_loop_1 + b .non_linear_loop + +{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:15 %} +{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:8, to:15 %} +{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:8, to:15 %} + +.add_unicast: + // r3, r4, r5 <- ptr, rsc, csc + {% for col in (0..3) %} + mov r2, r3 + {% for reg in (0..3) %} + vld1.f32 d0[0], [ r2 ], r4 + vld1.f32 d0[1], [ r2 ], r4 + vadd.f32 d{{col | times: 4 | plus: reg | plus : 16}}, d0 + {% endfor %} + add r3, r3, r5 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + vldmia r3!, { q0, q1 } + vldmia r4!, { q4 } + + vmla.f32 q8, q0, d8[0] + vmla.f32 q9, q1, d8[0] + + vmla.f32 q10, q0, d8[1] + vmla.f32 q11, q1, d8[1] + + vmla.f32 q12, q0, d9[0] + vmla.f32 q13, q1, d9[0] + + vmla.f32 q14, q0, d9[1] + vmla.f32 q15, q1, d9[1] + + b .non_linear_loop + +.store: + // r3,r4,r5 are c,rsc,csc + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..3) %} + vst1.f32 d{{col | times: 4 | plus: reg | plus : 16}}[0], [ r8 ], r4 + vst1.f32 d{{col | times: 4 | plus: reg | plus : 16}}[1], [ r8 ], r4 + {% endfor %} + {% if col < 3 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + b .non_linear_loop + +.load_tile: + vldmia r3!, { q8-q15 } + b .non_linear_loop + +.return: + vpop { q4-q7 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl new file mode 100644 index 000000000..7baefd69d 
--- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl @@ -0,0 +1,158 @@ +// vim: ft=arm + + .arm + .text + .global armv7neon_mmm_f32_8x6_{{core}}_{{suffix}} + .type armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}, %function + +armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}: + + pld [r0] + push { r4-r12 } + vpush { q4-q7 } + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] + pld [r5] + + .packed_packed_loop_1: + +{% if core == "cortexa7" %} + vldr d0, [r1] + vldr d1, [r1, #8] + vldr d2, [r1, #16] + vldr d3, [r1, #24] + vldr d4, [r5] + vldr d5, [r5, #8] + vldr d6, [r5, #16] +{% elsif core == "cortexa9" %} + vld1.64 {d0-d3}, [r1]! + vld1.64 {d4, d5, d6}, [r5]! +{% else %} + vldmia r1!, {q0-q1} + vldmia r5!, {d4-d6} +{% endif %} + +{% if core != "generic" %} + pld [r1, #512] + pld [r5, #512] +{% endif %} + + vmla.f32 q4, q0, d4[0] + vmla.f32 q5, q1, d4[0] + + vmla.f32 q6, q0, d4[1] + vmla.f32 q7, q1, d4[1] + + vmla.f32 q8, q0, d5[0] + vmla.f32 q9, q1, d5[0] + + vmla.f32 q10, q0, d5[1] + vmla.f32 q11, q1, d5[1] + + vmla.f32 q12, q0, d6[0] + vmla.f32 q13, q1, d6[0] + + vmla.f32 q14, q0, d6[1] + vmla.f32 q15, q1, d6[1] + +{% if core == "cortexa7" %} + add r1, #32 + add r5, #24 +{% endif %} + + subs r3, r3, #1 + bne .packed_packed_loop_1 + b .non_linear_loop + +{% include "armv7neon_mmm_f32_scalars.tmpliq" from:4, to:15 %} +{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:4, to:15 %} +{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:4, to:15 %} + +.add_unicast: + // r3, r4, r5, r6 <- ptr, rsc, csc, size + {% for col in (0..5) %} + mov r2, r3 + {% for reg in (0..3) %} + vld1.f32 d0[0], [ r2 ], r4 + vld1.f32 d0[1], [ r2 ], r4 + vadd.f32 d{{col | times: 4 | plus: reg | plus : 8}}, d0 + {% endfor %} + add r3, r3, r5 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + vldmia r3!, { q0, q1 } + vldmia r4!, { d4, d5, d6 } + + vmla.f32 q4, 
q0, d4[0] + vmla.f32 q5, q1, d4[0] + + vmla.f32 q6, q0, d4[1] + vmla.f32 q7, q1, d4[1] + + vmla.f32 q8, q0, d5[0] + vmla.f32 q9, q1, d5[0] + + vmla.f32 q10, q0, d5[1] + vmla.f32 q11, q1, d5[1] + + vmla.f32 q12, q0, d6[0] + vmla.f32 q13, q1, d6[0] + + vmla.f32 q14, q0, d6[1] + vmla.f32 q15, q1, d6[1] + + b .non_linear_loop + +.store: + // r3, r4, r5 <- ptr, rsc, csc + + cmp r4, #4 + bne .store_generic + + {% for col in (0..5) %} + mov r8, r3 + {% for reg in (0..3) %} + vst1.64 d{{col| times: 4 | plus: 8 | plus: reg}}, [ r8 ]! + {% endfor %} + {% if col < 5 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + + b .non_linear_loop + +.store_generic: + {% for col in (0..5) %} + mov r8, r3 + {% for reg in (0..3) %} + vst1.f32 d{{col | times: 4 | plus: reg | plus : 8}}[0], [ r8 ], r4 + vst1.f32 d{{col | times: 4 | plus: reg | plus : 8}}[1], [ r8 ], r4 + {% endfor %} + {% if col < 5 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + + b .non_linear_loop + +.load_tile: + vldmia r3!, { q4-q7 } + vldmia r3!, { q8-q15 } + b .non_linear_loop + +.return: + vpop { q4-q7 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq new file mode 100644 index 000000000..adc9b14ed --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq @@ -0,0 +1,9 @@ +// vim: ft=arm + +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_min", op:"vmin.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_max", op:"vmax.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_mul", op:"vmul.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_add", op:"vadd.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub", op:"vsub.f32", mr:mr, from:from, to:to %} +{% 
include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsub.f32", mr:mr, from:from, to:to, flipped: true%} + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq new file mode 100644 index 000000000..64dd5ca8a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq @@ -0,0 +1,9 @@ +// vim: ft=arm + +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_min", op:"vmin.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_max", op:"vmax.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_mul", op:"vmul.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_add", op:"vadd.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub", op:"vsub.f32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsub.f32", mr:mr, from:from, to:to, flipped: true%} + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq new file mode 100644 index 000000000..352606371 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq @@ -0,0 +1,24 @@ +// vim: ft=arm + +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_min", op:"vmin.f32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_max", op:"vmax.f32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_mul", op:"vmul.f32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_add", op:"vadd.f32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub", op:"vsub.f32", from:from, to:to%} +{% include
"armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsub.f32", from:from, to:to, flipped:true%} + +.leaky_relu: + vmov s0, r3 // s0 <- r3 (scalar operand, moved as raw bits) + vdup.32 q0, d0[0] // broadcast it to all four lanes of q0 + {% for reg in (from..to) %} + vmul.f32 q2, q{{reg}}, q0 // q2 <- x * q0 + vcgt.f32 q1, q{{reg}}, 0 // q1 <- per-lane mask, set where x > 0 + vbsl q1, q{{reg}}, q2 // q1 <- mask ? x : x * q0 + vmov q{{reg}}, q1 // write the selected lanes back to the accumulator + {% endfor %} + b .non_linear_loop + +.q_shl: +.q_shr: +.q_scale: + b .unsupported // the three quantization labels all branch to .unsupported in this f32 template diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl new file mode 100644 index 000000000..e176c48b3 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl @@ -0,0 +1,174 @@ +// vim: ft=arm + +// C tile regs: q8..q15 + + .arm + .text + .global armv7neon_mmm_i32_32x1_{{suffix}} + .type armv7neon_mmm_i32_32x1_{{suffix}}, %function + +armv7neon_mmm_i32_32x1_{{suffix}}: + + pld [r0] + push { r4-r12 } + vpush { q4-q7 } + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + // r3 r4 r5 r6 + // k a b packing + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] + pld [r7] + + cmp r6, #1 // packing == 1 selects the i8 x i8 loop below + beq .packed_packed_i8i8 + + .packed_packed: + + .packed_packed_loop_1: + vldmia r1!, { q4-q7 } + + vld1.32 { d0[0] }, [ r5 ]! + + vmla.s32 q8, q4, d0[0] + + vldmia r1!, { q1-q4 } // next 16 A values; q4 is free again after the q8 update above + + vmla.s32 q9, q5, d0[0] + vmla.s32 q10, q6, d0[0] + vmla.s32 q11, q7, d0[0] + + vmla.s32 q12, q1, d0[0] + vmla.s32 q13, q2, d0[0] + + vmla.s32 q14, q3, d0[0] + vmla.s32 q15, q4, d0[0] + + subs r3, r3, #1 + bne .packed_packed_loop_1 + b .non_linear_loop + + .packed_packed_i8i8: + + .packed_packed_loop_i8i8_1: + vldmia r1!, { q4-q5 } + + vld1.8 { d0[0] }, [ r5 ]!
+ vmovl.s8 q0, d0 + + vmovl.s8 q1, d8 + vmlal.s16 q8, d2, d0[0] + vmlal.s16 q9, d3, d0[0] + + vmovl.s8 q1, d9 + vmlal.s16 q10, d2, d0[0] + vmlal.s16 q11, d3, d0[0] + + vmovl.s8 q1, d10 + vmlal.s16 q12, d2, d0[0] + vmlal.s16 q13, d3, d0[0] + + vmovl.s8 q1, d11 + vmlal.s16 q14, d2, d0[0] + vmlal.s16 q15, d3, d0[0] + + subs r3, r3, #1 + bne .packed_packed_loop_i8i8_1 + b .non_linear_loop + +{% include "armv7neon_mmm_i32_scalars.tmpliq" from:8, to:15 %} +{% include "armv7neon_mmm_i32_per_rows.tmpliq" mr:32, from:8, to:15 %} +{% include "armv7neon_mmm_i32_per_cols.tmpliq" mr:32, from:8, to:15 %} + +.add_unicast: + // r3, r4, r5, r6 <- ptr, rsc, csc, size + + cmp r6, #4 + beq .non_linear_addc_i32 + + {% for reg in (16..31) %} + vld1.s8 d0[0], [ r3 ], r4 + vld1.s8 d0[1], [ r3 ], r4 + vmovl.s8 q0, d0 + vmovl.s16 q0, d0 + vadd.i32 d{{reg}}, d0 + {% endfor %} + + b .non_linear_loop + +.non_linear_addc_i32: + {% for reg in (16..31) %} + vld1.s32 d0[0], [ r3 ], r4 + vld1.s32 d0[1], [ r3 ], r4 + vadd.i32 d{{reg}}, d0 + {% endfor %} + b .non_linear_loop + +.add_row_col_products: + vldm r4, { s0 } + + vldmia r3!, { q4-q7 } + + vmla.s32 q8, q4, d0[0] + vmla.s32 q9, q5, d0[0] + + vmla.s32 q10, q6, d0[0] + vmla.s32 q11, q7, d0[0] + + vldmia r3!, { q4-q7 } + + vmla.s32 q12, q4, d0[0] + vmla.s32 q13, q5, d0[0] + + vmla.s32 q14, q6, d0[0] + vmla.s32 q15, q7, d0[0] + + b .non_linear_loop + + {% include "armv7neon_mmm_i32_scale_q8_q15.tmpliq" %} + +.store: + // r3, r4, r5, r6 <- ptr, rsc, csc, size + cmp r6, #4 + beq .store_strides_i32 + + {% for reg in (8..15) %} + vmovn.s32 d{{reg | times: 2}}, q{{reg}} + vmovn.s16 d{{reg | times: 2}}, q{{reg}} + {% endfor %} + {% for reg in (8..15) %} + {%capture d%}{{reg | times: 2 }}{%endcapture%} + vst1.s8 d{{d}}[0], [ r3 ], r4 + vst1.s8 d{{d}}[1], [ r3 ], r4 + vst1.s8 d{{d}}[2], [ r3 ], r4 + vst1.s8 d{{d}}[3], [ r3 ], r4 + {% endfor %} + + b .non_linear_loop + +.store_strides_i32: + {% for reg in (8..15) %} + {%capture d%}{{reg | times: 
2}}{%endcapture%} + vst1.s32 d{{d}}[0], [ r3 ], r4 + vst1.s32 d{{d}}[1], [ r3 ], r4 + vst1.s32 d{{d|plus:1}}[0], [ r3 ], r4 + vst1.s32 d{{d|plus:1}}[1], [ r3 ], r4 + {% endfor %} + + b .non_linear_loop + +.load_tile: + vldmia r3!, { q8-q15 } + b .non_linear_loop + +.return: + vpop { q4-q7 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl new file mode 100644 index 000000000..a50f5a10a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl @@ -0,0 +1,294 @@ +// vim: ft=arm + +// C tile regs +// +// q8[0] q10[0] q12[0] q14[0] +// q8[1] q10[1] q12[1] q14[1] +// q8[2] q10[2] q12[2] q14[2] +// q8[3] q10[3] q12[3] q14[3] +// +// q9[0] q11[0] q13[0] q15[0] +// q9[1] q11[1] q13[1] q15[1] +// q9[2] q11[2] q13[2] q15[2] +// q9[3] q11[3] q13[3] q15[3] + + .arm + .text + .global armv7neon_mmm_i32_8x4_{{suffix}} + .type armv7neon_mmm_i32_8x4_{{suffix}}, %function + +armv7neon_mmm_i32_8x4_{{suffix}}: + + pld [r0] + push { r4-r12 } + vpush { q4-q7 } + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + // r3 r4 r5 r6 + // k a b packing + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] + pld [r5] + + cmp r6, #1 + beq .packed_packed_i8i8 + + .packed_packed_loop_1: + + vldmia r1!, { q0, q1 } + vldmia r5!, { q2 } + + vmla.s32 q8, q0, d4[0] + vmla.s32 q9, q1, d4[0] + + vmla.s32 q10, q0, d4[1] + vmla.s32 q11, q1, d4[1] + + vmla.s32 q12, q0, d5[0] + vmla.s32 q13, q1, d5[0] + + vmla.s32 q14, q0, d5[1] + vmla.s32 q15, q1, d5[1] + + subs r3, r3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + + .packed_packed_i8i8: + pld [r5] // packed B ptr + + cmp r3, #4 + blt .packed_packed_loop_i8i8_1 + + .packed_packed_loop_i8i8_4: + pld [r1, #64] + pld [r5, #64] + + // q2: d4 -> d4,d5 A even cols (from r1) + // q3: d6 -> d6,d7 A odd cols (from r1) + // q0: s0 -> d0 : B even lines (from r5) + // 
q1: s4 -> d2 : B odd lines (from r5) + + // 0 + vldmia r1!, { d4 } + vldmia r5!, { s0 } + + vmovl.s8 q2, d4 + vmovl.s8 q0, d0 + + vmlal.s16 q8, d4, d0[0] + vmlal.s16 q9, d5, d0[0] + + vldmia r1!, { d6 } + + vmlal.s16 q10, d4, d0[1] + vmlal.s16 q11, d5, d0[1] + + vldmia r5!, { s4 } + + vmlal.s16 q12, d4, d0[2] + vmlal.s16 q13, d5, d0[2] + + vmlal.s16 q14, d4, d0[3] + vmlal.s16 q15, d5, d0[3] + + // 1 + vmovl.s8 q3, d6 + vmovl.s8 q1, d2 + + vmlal.s16 q8, d6, d2[0] + vldmia r1!, { d4 } + vmlal.s16 q9, d7, d2[0] + vldmia r5!, { s0 } + + vmlal.s16 q10, d6, d2[1] + vmlal.s16 q11, d7, d2[1] + + vmlal.s16 q12, d6, d2[2] + vmlal.s16 q13, d7, d2[2] + + vmlal.s16 q14, d6, d2[3] + vmlal.s16 q15, d7, d2[3] + + // 2 + vmovl.s8 q2, d4 + vmovl.s8 q0, d0 + + vmlal.s16 q8, d4, d0[0] + vmlal.s16 q9, d5, d0[0] + + vldmia r1!, { d6 } + + vmlal.s16 q10, d4, d0[1] + vmlal.s16 q11, d5, d0[1] + + vldmia r5!, { s4 } + + vmlal.s16 q12, d4, d0[2] + vmlal.s16 q13, d5, d0[2] + + vmlal.s16 q14, d4, d0[3] + vmlal.s16 q15, d5, d0[3] + + // 3 + vmovl.s8 q3, d6 + vmovl.s8 q1, d2 + + vmlal.s16 q8, d6, d2[0] + vmlal.s16 q9, d7, d2[0] + + vmlal.s16 q10, d6, d2[1] + vmlal.s16 q11, d7, d2[1] + + vmlal.s16 q12, d6, d2[2] + vmlal.s16 q13, d7, d2[2] + + vmlal.s16 q14, d6, d2[3] + vmlal.s16 q15, d7, d2[3] + + sub r3, r3, #4 + cmp r3, #4 + bge .packed_packed_loop_i8i8_4 + + cmp r3, #0 + beq .non_linear_loop + + .packed_packed_loop_i8i8_1: + + vldmia r1!, { s0, s1 } + vmovl.s8 q0, d0 + vldmia r5!, { s4 } + vmovl.s8 q1, d2 + + vmlal.s16 q8, d0, d2[0] + vmlal.s16 q9, d1, d2[0] + + vmlal.s16 q10, d0, d2[1] + vmlal.s16 q11, d1, d2[1] + + vmlal.s16 q12, d0, d2[2] + vmlal.s16 q13, d1, d2[2] + + vmlal.s16 q14, d0, d2[3] + vmlal.s16 q15, d1, d2[3] + + subs r3, r3, #1 + bne .packed_packed_loop_i8i8_1 + b .non_linear_loop + +{% include "armv7neon_mmm_i32_scalars.tmpliq" from:8, to:15 %} +{% include "armv7neon_mmm_i32_per_rows.tmpliq" mr:8, from:8, to:15 %} +{% include "armv7neon_mmm_i32_per_cols.tmpliq" mr:8, from:8, 
to:15 %} + +.add_unicast: + // r3, r4, r5, r6 <- ptr, rsc, csc, size + cmp r6, #4 + beq .non_linear_addc_i32 + + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..3) %} + vld1.s8 d0[0], [ r8 ], r4 + vld1.s8 d0[1], [ r8 ], r4 + vmovl.s8 q0, d0 + vmovl.s16 q0, d0 + vadd.i32 d{{col | times: 4 | plus: reg | plus : 16}}, d0 + {% endfor %} + add r3, r3, r5 + {% endfor %} + + b .non_linear_loop + +.non_linear_addc_i32: + + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..3) %} + vld1.s32 d0[0], [ r8 ], r4 + vld1.s32 d0[1], [ r8 ], r4 + vadd.i32 d{{col | times: 4 | plus: reg | plus : 16}}, d0 + {% endfor %} + {% if col < 3 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + +b .non_linear_loop + +.add_row_col_products: + vldmia r3!, { q0, q1 } + vldmia r4!, { q4 } + + vmla.s32 q8, q0, d8[0] + vmla.s32 q9, q1, d8[0] + + vmla.s32 q10, q0, d8[1] + vmla.s32 q11, q1, d8[1] + + vmla.s32 q12, q0, d9[0] + vmla.s32 q13, q1, d9[0] + + vmla.s32 q14, q0, d9[1] + vmla.s32 q15, q1, d9[1] + + b .non_linear_loop + + {% include "armv7neon_mmm_i32_scale_q8_q15.tmpliq" %} + +.store: + // r3, r4, r5, r6 <- ptr, rsc, csc, size + cmp r6, #4 + beq .store_strides_i32 + + {% for reg in (8..15) %} + vmovn.s32 d{{reg | times: 2}}, q{{reg}} + vmovn.s16 d{{reg | times: 2}}, q{{reg}} + {% endfor %} + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..1) %} + {%capture d%}{{col | times: 2 | plus: reg | times: 2 | plus: 16}}{%endcapture%} + vst1.s8 d{{d}}[0], [ r8 ], r4 + vst1.s8 d{{d}}[1], [ r8 ], r4 + vst1.s8 d{{d}}[2], [ r8 ], r4 + vst1.s8 d{{d}}[3], [ r8 ], r4 + {% endfor %} + {% if col < 3 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + + b .non_linear_loop + +.store_strides_i32: + + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..3) %} + {% for lane in (0..1) %} + vst1.s32 d{{col | times: 4 | plus: reg | plus: 16}}[{{lane}}], [ r8 ], r4 + {% endfor %} + {% endfor %} + {% if col < 3 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + + b .non_linear_loop + +.load_tile: 
+ vldmia r3!, { q8-q15 } + b .non_linear_loop + +.return: + vpop { q4-q7 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq new file mode 100644 index 000000000..3beef2095 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq @@ -0,0 +1,8 @@ +// vim: ft=arm + +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_min", op:"vmin.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_max", op:"vmax.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_mul", op:"vmul.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_add", op:"vadd.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub", op:"vsub.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsub.s32", mr:mr, from:from, to:to, flipped:true%} diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq new file mode 100644 index 000000000..f0739b31c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq @@ -0,0 +1,8 @@ +// vim: ft=arm + +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_min", op:"vmin.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_max", op:"vmax.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_mul", op:"vmul.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_add", op:"vadd.s32", mr:mr, from:from, to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub", op:"vsub.s32", mr:mr, from:from, 
to:to %} +{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsub.s32", mr:mr, from:from, to:to, flipped:true%} diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq new file mode 100644 index 000000000..7c3053d5a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq @@ -0,0 +1,20 @@ +// vim: ft=arm + +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_min", op:"vmin.s32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_max", op:"vmax.s32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_mul", op:"vmul.s32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_add", op:"vadd.s32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub", op:"vsub.s32", from:from, to:to%} +{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsub.s32", from:from, to:to, flipped:true%} + +.leaky_relu: + vmov s0, r3 + vdup.32 q0, d0[0] + {% for reg in (from..to) %} + vmul.s32 q2, q{{reg}}, q0 + vcgt.s32 q1, q{{reg}}, 0 + vbsl q1, q{{reg}}, q2 + vmov q{{reg}}, q1 + {% endfor %} + b .non_linear_loop + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq new file mode 100644 index 000000000..f594928ae --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq @@ -0,0 +1,232 @@ +// vim: ft=arm + +.q_scale: + ldm r0, { r4, r5, r6, r7 } // fixme params are already loaded by disp. 
+ vdup.s32 q0, r7 // q0 <- multiplier + + mov r3, #1 + vdup.s32 q1, r3 // q1 <- ones + vmovl.s32 q1, d2 + + add r5, #32 + neg r5, r5 + vdup.s32 q2, r5 // q2 <- -(shift + 32) + vmovl.s32 q2, d4 + + cmp r6, #1 + beq .q_scale_rounding_zero + cmp r6, #2 + beq .q_scale_rounding_away + cmp r6, #3 + beq .q_scale_rounding_minus_inf + cmp r6, #4 + beq .q_scale_rounding_plus_inf + cmp r6, #5 + beq .q_scale_rounding_even + cmp r6, #6 + beq .q_scale_rounding_odd + + b .unsupported + +.q_scale_rounding_zero: + {% for q in (8..15) %} + vclt.s32 q7, q{{q}}, #0 + vabs.s32 q{{q}}, q{{q}} + vqdmull.s32 q5, d{{q | times:2}}, d0[0] + vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0] + vsub.s64 q5, q1 + vsub.s64 q6, q1 + vqrshl.s64 q5, q2 + vqrshl.s64 q6, q2 + vmovn.s64 d{{q | times:2}}, q5 + vmovn.s64 d{{q | times:2 | plus: 1}}, q6 + vneg.s32 q5, q{{q}} + vbit.s32 q{{q}}, q5, q7 + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_away: + {% for q in (8..15) %} + vclt.s32 q7, q{{q}}, #0 + vabs.s32 q{{q}}, q{{q}} + vqdmull.s32 q5, d{{q | times:2}}, d0[0] + vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0] + vqrshl.s64 q5, q2 + vqrshl.s64 q6, q2 + vmovn.s64 d{{q | times:2}}, q5 + vmovn.s64 d{{q | times:2 | plus: 1}}, q6 + vneg.s32 q5, q{{q}} + vbit.s32 q{{q}}, q5, q7 + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_minus_inf: + {% for q in (8..15) %} + vqdmull.s32 q5, d{{q | times:2}}, d0[0] + vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0] + vsub.s64 q5, q1 + vsub.s64 q6, q1 + vqrshl.s64 q5, q2 + vqrshl.s64 q6, q2 + vmovn.s64 d{{q | times:2}}, q5 + vmovn.s64 d{{q | times:2 | plus: 1}}, q6 + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_plus_inf: + {% for q in (8..15) %} + vqdmull.s32 q5, d{{q | times:2}}, d0[0] + vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0] + vqrshl.s64 q5, q2 + vqrshl.s64 q6, q2 + vmovn.s64 d{{q | times:2}}, q5 + vmovn.s64 d{{q | times:2 | plus: 1}}, q6 + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_even: + {% for q in (8..15) 
%} + vclt.s32 q7, q{{q}}, #0 + vabs.s32 q{{q}}, q{{q}} + vqdmull.s32 q5, d{{q | times:2}}, d0[0] + vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0] + vqshl.s64 q3, q5, q2 + vqshl.s64 q4, q6, q2 + vand q3, q3, q1 + vand q4, q4, q1 + vsub.s64 q3, q3, q1 + vsub.s64 q4, q4, q1 + vadd.s64 q5, q3 + vadd.s64 q6, q4 + vqrshl.s64 q5, q2 + vqrshl.s64 q6, q2 + vmovn.s64 d{{q | times:2}}, q5 + vmovn.s64 d{{q | times:2 | plus: 1}}, q6 + vneg.s32 q5, q{{q}} + vbit.s32 q{{q}}, q5, q7 + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_odd: + {% for q in (8..15) %} + vclt.s32 q7, q{{q}}, #0 + vabs.s32 q{{q}}, q{{q}} + vqdmull.s32 q5, d{{q | times:2}}, d0[0] + vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0] + vqshl.s64 q3, q5, q2 + vqshl.s64 q4, q6, q2 + vand q3, q3, q1 + vand q4, q4, q1 + vsub.s64 q5, q3 + vsub.s64 q6, q4 + vqrshl.s64 q5, q2 + vqrshl.s64 q6, q2 + vmovn.s64 d{{q | times:2}}, q5 + vmovn.s64 d{{q | times:2 | plus: 1}}, q6 + vneg.s32 q5, q{{q}} + vbit.s32 q{{q}}, q5, q7 + {% endfor %} + + b .non_linear_loop + +.q_shl: + ldm r0, { r4, r5 } // fixme params are already loaded by disp. + vdup.s32 q2, r5 // q2 <- shift + + {% for q in (8..15) %} + vqrshl.s32 q{{q}}, q2 // Shift + {% endfor %} + + b .non_linear_loop + +.q_shr: + ldm r0, { r4, r5, r6 } // fixme params are already loaded by disp. 
+ + mov r3, #1 + vdup.s32 q1, r3 // q1 <- ones + + neg r5, r5 + vdup.s32 q2, r5 // q2 <- -shift (negated: a negative count makes vqrshl/vqshl shift right) + + cmp r6, #1 + beq .q_shr_rounding_zero + cmp r6, #2 + beq .q_shr_rounding_away + cmp r6, #3 + beq .q_shr_rounding_minus_inf + cmp r6, #4 + beq .q_shr_rounding_plus_inf + cmp r6, #5 + beq .q_shr_rounding_even + cmp r6, #6 + beq .q_shr_rounding_odd + + b .unsupported + +.q_shr_rounding_zero: + // return signum(x) * ((abs(x) - 1) >>r shift ) + {% for q in (8..15) %} + vclt.s32 q3, q{{q}}, #0 // Store the sign of the value + vabs.s32 q{{q}}, q{{q}} // Compute their abs + vsub.s32 q{{q}}, q1 // Subtract 1 from abs(x) + vqrshl.s32 q{{q}}, q2 // Rounding shift (0.5 -> 1) + vneg.s32 q4, q{{q}} // Compute -((abs(x) - 1) >>r shift ) + vbit.s32 q{{q}}, q4, q3 // Restore sign of x with bit mask + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_away: + // return signum(x) * (abs(x) >>r shift ) + {% for q in (8..15) %} + vclt.s32 q3, q{{q}}, #0 // Store the sign of the value + vabs.s32 q{{q}}, q{{q}} // Compute their abs + vqrshl.s32 q{{q}}, q2 // Rounding shift (0.5 -> 1) + vneg.s32 q4, q{{q}} // Compute -(abs(x) >>r shift ) + vbit.s32 q{{q}}, q4, q3 // Restore sign of x with bit mask + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_minus_inf: + // return -(-x >>r shift) + {% for q in (8..15) %} + vneg.s32 q3, q{{q}} // Compute -x + vqrshl.s32 q3, q2 // Rounding shift (0.5 -> 1) + vneg.s32 q{{q}}, q3 // Compute -(-x >>r shift) + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_plus_inf: + // return x >>r shift + {% for q in (8..15) %} + vqrshl.s32 q{{q}}, q2 // Rounding shift (0.5 -> 1) + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_even: + // If (x >> shift) is odd -> (x - 0) >>r shift + // If (x >> shift) is even -> (x - 1) >>r shift + {% for q in (8..15) %} + vqshl.s32 q3, q{{q}}, q2 // Truncate shift (0.5 -> 0) + vand.s32 q4, q3, q1 // Store whether (x >> shift) is odd + vsub.s32 q5, q4, q1 // If (x >> shift) is odd 0 else -1 + vadd.s32 q{{q}}, q{{q}}, q5 // If (x >>
shift) is odd (x - 0) else (x - 1) + vqrshl.s32 q{{q}}, q2 // Rounding shift (0.5 -> 1) + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_odd: + // If (x >> shift) is even -> (x - 0) >>r shift + // If (x >> shift) is odd -> (x - 1) >>r shift + {% for q in (8..15) %} + vqshl.s32 q3, q{{q}}, q2 // Truncate shift (0.5 -> 0) + vand.s32 q4, q3, q1 // Store if x >> shift is odd + vneg.s32 q5, q4 // If x is odd -1 else 0 + vadd.s32 q{{q}}, q{{q}}, q5 // If x is odd (x - 1) else (x - 0) + vqrshl.s32 q{{q}}, q2 // Rounding shift (0.5 -> 1) + {% endfor %} + b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq new file mode 100644 index 000000000..769d290cf --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq @@ -0,0 +1,33 @@ +// vim: ft=arm + +.{{label}}: + +{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%} +{% capture mr_over_4_min_1 %}{{ mr | divided_by: 4 | minus: 1}}{%endcapture%} + +{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_4}}{%endcapture%} +{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_4|minus:1}}{%endcapture%} + +{% if cols == "1" %} + vld1.f32 d0[0], [ r3 ] +{% else %} + {%capture cols_over_2_minus_1%}{{cols | divided_by:2 | minus:1}}{%endcapture%} + {% for c in (0..cols_over_2_minus_1) %} + vldmia r3!, { d{{c}} } + {% endfor %} +{% endif %} + + +{% for right in (0..cols_min_1) %} + vdup.f32 q3, d{{right|divided_by:2}}[{{right| modulo:2}}] + {% for down in (0..mr_over_4_min_1) %} + {%capture acc%}{{mr_over_4|times:right|plus:from|plus:down}}{%endcapture%} + {% if flipped %} + {{op}} q{{acc}}, q{{acc}}, q3 + {% else %} + {{op}} q{{acc}}, q3, q{{acc}} + {% endif %} + {% endfor %} +{% endfor %} + + b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq 
b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq new file mode 100644 index 000000000..a0f2d40c6 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq @@ -0,0 +1,24 @@ +// vim: ft=arm + +.{{label}}: + +{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%} +{% capture mr_over_4_min_1 %}{{ mr | divided_by: 4 | minus: 1}}{%endcapture%} + +{% for reg in (0..mr_over_4_min_1) %} + vldmia r3!, { q{{reg}} } +{% endfor %} + +{% if flipped %} + {% for acc in (from..to) %} + {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%} + {{op}} q{{acc}}, q{{acc}}, q{{other}} + {% endfor %} +{% else %} + {% for acc in (from..to) %} + {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%} + {{op}} q{{acc}}, q{{other}}, q{{acc}} + {% endfor %} +{% endif %} + +b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq new file mode 100644 index 000000000..4f135b415 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq @@ -0,0 +1,15 @@ +// vim: ft=arm + +.{{label}}: + vmov s0, r3 + vdup.32 q0, d0[0] + {% if flipped %} + {% for reg in (from..to) %} + {{op}} q{{reg}}, q{{reg}}, q0 + {% endfor %} + {% else %} + {% for reg in (from..to) %} + {{op}} q{{reg}}, q0, q{{reg}} + {% endfor %} + {% endif %} + b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl new file mode 100644 index 000000000..d153e66b8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl @@ -0,0 +1,22 @@ +// vim: ft=arm + +.arm +.text +.global armv7neon_prefetch_{{suffix}} +.type armv7neon_prefetch_{{suffix}}, %function + +armv7neon_prefetch_{{suffix}}: +loop: + pld [r0] + pld [r0, #32] + pld [r0, #64] + pld [r0, 
#96] + pld [r0, #128] + pld [r0, #160] + pld [r0, #192] + pld [r0, #224] + add r0, r0, #256 + cmp r0, r1 + blt loop + + bx lr diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl new file mode 100644 index 000000000..baa5072ae --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl @@ -0,0 +1,215 @@ +// vim: ft=arm + + .arm + .text + .global armv7neon_sigmoid_f32_4n_{{suffix}} + .type armv7neon_sigmoid_f32_4n_{{suffix}}, %function + +/* + s16–s31 (d8–d15, q4–q7) must be preserved + s0–s15 (d0–d7, q0–q3) and d16–d31 (q8–q15) do not need to be preserved +*/ + +armv7neon_sigmoid_f32_4n_{{suffix}}: + cmp r1, #0 + blxeq lr + + vpush { q4-q7 } + + adr r2, .coeffs_num + vldmia r2!, { s0-s13 } + +// q4 -> q4,5,6 +// q5 -> q7,8,9 +// q6 -> q10,11,12 +// q7 -> q13,14,15 + + + cmp r1, #12 + blt .loop + +.loop_3: + vldmia r0, { q4, q5, q6 } // q4 <- x + + vdup.32 q15, d0[0] + vmax.f32 q4, q15 + vmax.f32 q5, q15 + vmax.f32 q6, q15 + vdup.32 q15, d0[1] + vmin.f32 q4, q15 + vmin.f32 q5, q15 + vmin.f32 q6, q15 + + vmul.f32 q7, q4, q4 // q7 <- x2 + vmul.f32 q8, q5, q5 + vmul.f32 q9, q6, q6 + + vdup.32 q10, d1[0] + vdup.32 q11, d1[0] + vdup.32 q12, d1[0] + vdup.32 q13, d1[1] + vdup.32 q14, d1[1] + vdup.32 q15, d1[1] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d2[0] + vdup.32 q11, d2[0] + vdup.32 q12, d2[0] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vdup.32 q13, d2[1] + vdup.32 q14, d2[1] + vdup.32 q15, d2[1] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d3[0] + vdup.32 q11, d3[0] + vdup.32 q12, d3[0] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vdup.32 q13, d3[1] + vdup.32 q14, d3[1] + vdup.32 q15, d3[1] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d4[0] + 
vdup.32 q11, d4[0] + vdup.32 q12, d4[0] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vmul.f32 q4, q4, q10 // q4 <- numerator + vmul.f32 q5, q5, q11 + vmul.f32 q6, q6, q12 + + vdup.32 q10, d4[1] + vdup.32 q11, d4[1] + vdup.32 q12, d4[1] + vdup.32 q13, d5[0] + vdup.32 q14, d5[0] + vdup.32 q15, d5[0] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d5[1] + vdup.32 q11, d5[1] + vdup.32 q12, d5[1] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vdup.32 q13, d6[0] + vdup.32 q14, d6[0] + vdup.32 q15, d6[0] + vmla.f32 q13, q7, q10 // q13 <- denum + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + + vrecpe.f32 q7, q13 + vrecpe.f32 q8, q14 + vrecpe.f32 q9, q15 + vrecps.f32 q10, q7, q13 + vrecps.f32 q11, q8, q14 + vrecps.f32 q12, q9, q15 + vmul.f32 q7, q7, q10 + vmul.f32 q8, q8, q11 + vmul.f32 q9, q9, q12 + vrecps.f32 q10, q7, q13 + vrecps.f32 q11, q8, q14 + vrecps.f32 q12, q9, q15 + vmul.f32 q7, q7, q10 // q7 <- 1/q13 + vmul.f32 q8, q8, q11 + vmul.f32 q9, q9, q12 + + vdup.32 q10, d6[1] + vdup.32 q11, d6[1] + vdup.32 q12, d6[1] + vmla.f32 q10, q4, q7 + vmla.f32 q11, q5, q8 + vmla.f32 q12, q6, q9 + + vstmia r0!, { q10, q11, q12 } + + subs r1, #12 + cmp r1, #12 + bge .loop_3 + + cmp r1, #0; + beq .return + +.loop: + vldmia r0, { q4 } // q4 <- x + + vdup.32 q15, d0[0] + vmax.f32 q4, q15 + vdup.32 q15, d0[1] + vmin.f32 q4, q15 + + vmul.f32 q7, q4, q4 // q7 <- x2 + + vdup.32 q10, d1[0] + vdup.32 q13, d1[1] + vmla.f32 q13, q7, q10 + vdup.32 q10, d2[0] + vmla.f32 q10, q13, q7 + vdup.32 q13, d2[1] + vmla.f32 q13, q7, q10 + vdup.32 q10, d3[0] + vmla.f32 q10, q13, q7 + vdup.32 q13, d3[1] + vmla.f32 q13, q7, q10 + vdup.32 q10, d4[0] + vmla.f32 q10, q13, q7 + vmul.f32 q4, q4, q10 // q4 <- numerator + + vdup.32 q10, d4[1] + vdup.32 q13, d5[0] + vmla.f32 q13, q7, q10 + vdup.32 q10, d5[1] + vmla.f32 q10, q13, q7 + vdup.32 q13, d6[0] + vmla.f32 q13, q7, q10 // q13 <- denum + + vrecpe.f32 q7, q13 + 
vrecps.f32 q10, q7, q13 + vmul.f32 q7, q7, q10 + vrecps.f32 q10, q7, q13 + vmul.f32 q7, q7, q10 // q7 <- 1/q13 + + vdup.32 q10, d6[1] + vmla.f32 q10, q4, q7 + + vstmia r0!, { q10 } + + subs r1, #4; + bne .loop + +.return: + vpop { q4-q7 } + bx lr + +.coeffs_num: + .float -18.6 // low + .float 18.6 // high + .float -4.433153405e-18 // alpha_13 + .float 1.169974371e-14 + + .float -1.875289645e-11 + .float 4.257889523e-8 + .float 0.00004811817576 + .float 0.008163842030 + + .float 0.2499999971 + .float 3.922935744e-6 // beta_6 + .float 0.001524872358 + .float 0.1159886749 + + .float 1.0 + .float 0.5 // + .float 0.0 // padding + .float 0.0 + diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl new file mode 100644 index 000000000..5165f6fb9 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl @@ -0,0 +1,209 @@ +// vim: ft=arm + + .arm + .text + .global armv7neon_tanh_f32_4n_{{suffix}} + .type armv7neon_tanh_f32_4n_{{suffix}}, %function + +/* + s16–s31 (d8–d15, q4–q7) must be preserved + s0–s15 (d0–d7, q0–q3) and d16–d31 (q8–q15) do not need to be preserved +*/ + +armv7neon_tanh_f32_4n_{{suffix}}: + cmp r1, #0 + blxeq lr + + vpush { q4-q7 } + + adr r2, .coeffs_num + vldmia r2!, { s0-s13 } + +// q4 -> q4,5,6 +// q5 -> q7,8,9 +// q6 -> q10,11,12 +// q7 -> q13,14,15 + + cmp r1, #12 + blt .loop + +.loop_3: + vldmia r0, { q4, q5, q6 } // q4 <- x + + vdup.32 q15, d0[0] + vmax.f32 q4, q15 + vmax.f32 q5, q15 + vmax.f32 q6, q15 + vdup.32 q15, d0[1] + vmin.f32 q4, q15 + vmin.f32 q5, q15 + vmin.f32 q6, q15 + + vmul.f32 q7, q4, q4 // q7 <- x2 + vmul.f32 q8, q5, q5 + vmul.f32 q9, q6, q6 + + vdup.32 q10, d1[0] + vdup.32 q11, d1[0] + vdup.32 q12, d1[0] + vdup.32 q13, d1[1] + vdup.32 q14, d1[1] + vdup.32 q15, d1[1] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d2[0] + vdup.32 q11, d2[0] + vdup.32 q12, d2[0] + 
vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vdup.32 q13, d2[1] + vdup.32 q14, d2[1] + vdup.32 q15, d2[1] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d3[0] + vdup.32 q11, d3[0] + vdup.32 q12, d3[0] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vdup.32 q13, d3[1] + vdup.32 q14, d3[1] + vdup.32 q15, d3[1] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d4[0] + vdup.32 q11, d4[0] + vdup.32 q12, d4[0] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vmul.f32 q4, q4, q10 // q4 <- numerator + vmul.f32 q5, q5, q11 + vmul.f32 q6, q6, q12 + + vdup.32 q10, d4[1] + vdup.32 q11, d4[1] + vdup.32 q12, d4[1] + vdup.32 q13, d5[0] + vdup.32 q14, d5[0] + vdup.32 q15, d5[0] + vmla.f32 q13, q7, q10 + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + vdup.32 q10, d5[1] + vdup.32 q11, d5[1] + vdup.32 q12, d5[1] + vmla.f32 q10, q13, q7 + vmla.f32 q11, q14, q8 + vmla.f32 q12, q15, q9 + vdup.32 q13, d6[0] + vdup.32 q14, d6[0] + vdup.32 q15, d6[0] + vmla.f32 q13, q7, q10 // q13 <- denum + vmla.f32 q14, q8, q11 + vmla.f32 q15, q9, q12 + + vrecpe.f32 q7, q13 + vrecpe.f32 q8, q14 + vrecpe.f32 q9, q15 + vrecps.f32 q10, q7, q13 + vrecps.f32 q11, q8, q14 + vrecps.f32 q12, q9, q15 + vmul.f32 q7, q7, q10 + vmul.f32 q8, q8, q11 + vmul.f32 q9, q9, q12 + vrecps.f32 q10, q7, q13 + vrecps.f32 q11, q8, q14 + vrecps.f32 q12, q9, q15 + vmul.f32 q7, q7, q10 // q7 <- 1/q13 + vmul.f32 q8, q8, q11 + vmul.f32 q9, q9, q12 + + vmul.f32 q10, q4, q7 + vmul.f32 q11, q5, q8 + vmul.f32 q12, q6, q9 + + vstmia r0!, { q10, q11, q12 } + + subs r1, #12 + cmp r1, #12 + bge .loop_3 + + cmp r1, #0; + beq .return + +.loop: + vldmia r0, { q4 } // q4 <- x + + vdup.32 q15, d0[0] + vmax.f32 q4, q15 + vdup.32 q15, d0[1] + vmin.f32 q4, q15 + + vmul.f32 q7, q4, q4 // q7 <- x2 + + vdup.32 q10, d1[0] + vdup.32 q13, d1[1] + vmla.f32 q13, q7, q10 + vdup.32 q10, d2[0] + vmla.f32 
q10, q13, q7 + vdup.32 q13, d2[1] + vmla.f32 q13, q7, q10 + vdup.32 q10, d3[0] + vmla.f32 q10, q13, q7 + vdup.32 q13, d3[1] + vmla.f32 q13, q7, q10 + vdup.32 q10, d4[0] + vmla.f32 q10, q13, q7 + vmul.f32 q4, q4, q10 // q4 <- numerator + + vdup.32 q10, d4[1] + vdup.32 q13, d5[0] + vmla.f32 q13, q7, q10 + vdup.32 q10, d5[1] + vmla.f32 q10, q13, q7 + vdup.32 q13, d6[0] + vmla.f32 q13, q7, q10 // q13 <- denominator + + vrecpe.f32 q7, q13 + vrecps.f32 q10, q7, q13 + vmul.f32 q7, q7, q10 + vrecps.f32 q10, q7, q13 + vmul.f32 q7, q7, q10 // q7 <- 1/q13 + + vmul.f32 q10, q4, q7 + + vstmia r0!, { q10 } + + subs r1, #4; + bne .loop + +.return: + vpop { q4-q7 } + bx lr + +.coeffs_num: + .float -8.9 // low + .float 8.9 // high + .float -8.488492677e-14 // alpha_13 + .float 5.277853000e-11 + + .float -2.022500419e-8 + .float 0.00001115424833 + .float 0.003103950131 + .float 0.1308400453 + + .float 0.9999999934 + .float 0.0002546136580 // beta_6 + .float 0.02449515379 + .float 0.4641733162 + + .float 1.0 + .float 0 // padding + .float 0 // padding + .float 0 // padding diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq new file mode 100644 index 000000000..2c5e910f5 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq @@ -0,0 +1,38 @@ +// vim: ft=arm + +.non_linear: + +.non_linear_loop_entry: + sub r0, #20 + +.non_linear_loop: + add r0, #20 + ldm r0, { r2, r3, r4, r5, r6 } + + cmp r2, #{{ jump_table | size }} + movgt r2, #{{ jump_table | size }} + cmp r2, #0 + movlt r2, #{{ jump_table | size }} + + add pc, pc, r2, LSL#2 + nop // pc in Rn above is start of the add instruction + 8, hence a nop is needed + // This is A32 asm, for T32/Thumb2 use nop.w and b.w to avoid problems.
+{% for j in jump_table %} + b .{{j}} +{% endfor %} + b .unsupported + + +.unsupported: + mov r0, #1 + b .return + +.done: + mov r0, #0 + b .return + +.clear: +{% for r in (4..15) %} + veor q{{r}}, q{{r}}, q{{r}} +{% endfor %} + b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl new file mode 100644 index 000000000..23fbcc2d1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl @@ -0,0 +1,491 @@ +// vim: ft=arm + + .arm + .text + .global armvfpv2_mmm_f32_4x4_{{suffix}} + .type armvfpv2_mmm_f32_4x4_{{suffix}}, %function + +// C tile: + +// s16 s20 s24 s28 +// s17 s21 s25 s29 +// s18 s22 s26 s30 +// s19 s23 s27 s31 + +// packed A: (2x4) alternating between (s0-s3) and (s4-s7) +// packed B: (2x4) alternating between (s8-s11) and (s12-15) + +// all vfp registers in use. + +armvfpv2_mmm_f32_4x4_{{suffix}}: + +/* + pld [r1] + pld [r1, #8] + pld [r2] + pld [r2, #8] +*/ + + push { r4-r12 } // no lr (we're a leaf), no fp. 
#24 bytes + + ldr r8, [sp, #28] + ldr r9, [sp, #24] + +// r8=rsc, r9=csc -- NOTE(review): push { r4-r12 } saves 9 registers (36 bytes, not 24), so these two loads read the saved-register area; r8/r9 appear unused in this kernel before being overwritten -- confirm and drop + + vmrs r6, FPSCR + bic r6, r6, #0x00370000 + vmsr FPSCR, r6 + + vpush { s16-s31 } + +{% include "dispatcher.tmpliq" %} + +.clear: + eor r6, r6 + vmov s16, r6 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + b .non_linear_loop + +.add_mat_mul: + // r3 <- k, r4 <- a, r5 <- b + cmp r3, #0 + beq .non_linear_loop + + mov r1, r4 // packed A ptr + pld [r3] // NOTE(review): r3 is the k counter here, not a pointer; presumably r1 (packed A) was intended -- confirm + pld [r5] + + .packed_packed: + cmp r3, #4 + blt .packed_packed_loop_1 + + .packed_packed_loop_4: + + // 1 + vldmia r1!, { s0, s1 } + vldmia r5!, { s8, s9 } + + vmla.f32 s16, s0, s8 + vldmia r1!, { s2, s3 } + vmla.f32 s17, s1, s8 + vldmia r5!, { s10, s11 } + vmla.f32 s18, s2, s8 + vmla.f32 s19, s3, s8 + + vmla.f32 s20, s0, s9 + vmla.f32 s21, s1, s9 + vmla.f32 s22, s2, s9 + vmla.f32 s23, s3, s9 + + vldmia r1!, { s4-s7 } + vmla.f32 s24, s0, s10 + vmla.f32 s25, s1, s10 + vmla.f32 s26, s2, s10 + vmla.f32 s27, s3, s10 + + vldmia r5!, { s12-s15 } + vmla.f32 s28, s0, s11 + vmla.f32 s29, s1, s11 + vmla.f32 s30, s2, s11 + vmla.f32 s31, s3, s11 + + // 2 + vmla.f32 s16, s4, s12 + vmla.f32 s17, s5, s12 + vmla.f32 s18, s6, s12 + vmla.f32 s19, s7, s12 + + vldmia r1!, { s0-s3 } + + vmla.f32 s20, s4, s13 + vmla.f32 s21, s5, s13 + vmla.f32 s22, s6, s13 + vmla.f32 s23, s7, s13 + + vldmia r5!, { s8-s11 } + + vmla.f32 s24, s4, s14 + vmla.f32 s25, s5, s14 + vmla.f32 s26, s6, s14 + vmla.f32 s27, s7, s14 + + vmla.f32 s28, s4, s15 + vmla.f32 s29, s5, s15 + vmla.f32 s30, s6, s15 + vmla.f32 s31, s7, s15 + + // 3 + vmla.f32 s16, s0, s8 + vmla.f32 s17, s1, s8 + vmla.f32 s18, s2, s8 + vmla.f32 s19, s3, s8 + + vldmia r1!, { s4-s7 } + + vmla.f32 s20, s0, s9 + vmla.f32 s21, s1, s9 + vmla.f32 s22, s2, s9 + vmla.f32 s23, s3, s9 +
+ vldmia r5!, { s12-s15 } + + vmla.f32 s24, s0, s10 + vmla.f32 s25, s1, s10 + vmla.f32 s26, s2, s10 + vmla.f32 s27, s3, s10 + + pld [r1] + + vmla.f32 s28, s0, s11 + vmla.f32 s29, s1, s11 + vmla.f32 s30, s2, s11 + vmla.f32 s31, s3, s11 + + pld [r6] + + // 4 + vmla.f32 s16, s4, s12 + vmla.f32 s17, s5, s12 + vmla.f32 s18, s6, s12 + vmla.f32 s19, s7, s12 + + vmla.f32 s20, s4, s13 + vmla.f32 s21, s5, s13 + vmla.f32 s22, s6, s13 + vmla.f32 s23, s7, s13 + + vmla.f32 s24, s4, s14 + vmla.f32 s25, s5, s14 + vmla.f32 s26, s6, s14 + vmla.f32 s27, s7, s14 + + vmla.f32 s28, s4, s15 + vmla.f32 s29, s5, s15 + vmla.f32 s30, s6, s15 + vmla.f32 s31, s7, s15 + + sub r3, r3, #4 + cmp r3, #4 + bge .packed_packed_loop_4 + + cmp r3, #0 + beq .non_linear_loop + + .packed_packed_loop_1: + + vldmia r1!, { s0, s1 } + vldmia r5!, { s8, s9 } + + vmla.f32 s16, s0, s8 + vldmia r1!, { s2, s3 } + vmla.f32 s17, s1, s8 + vldmia r5!, { s10, s11 } + vmla.f32 s18, s2, s8 + vmla.f32 s19, s3, s8 + + vmla.f32 s20, s0, s9 + vmla.f32 s21, s1, s9 + vmla.f32 s22, s2, s9 + vmla.f32 s23, s3, s9 + + vmla.f32 s24, s0, s10 + vmla.f32 s25, s1, s10 + vmla.f32 s26, s2, s10 + vmla.f32 s27, s3, s10 + + vmla.f32 s28, s0, s11 + vmla.f32 s29, s1, s11 + vmla.f32 s30, s2, s11 + vmla.f32 s31, s3, s11 + + subs r3, r3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.add_unicast: + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..3) %} + vldr s0, [ r8 ] + vadd.f32 s{{col|times:4|plus:reg|plus:16}}, s{{col|times:4|plus:reg|plus:16}}, s0 + {% if reg < 3 %} + add r8, r8, r4 + {% endif %} + {% endfor %} + {% if col < 3 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + + b .non_linear_loop + +.scalar_min: + vmov s0, r3 + {% for reg in (16..31) %} + vcmp.f32 s{{reg}}, s0 + vmrs apsr_nzcv, fpscr + vmovge s{{reg}}, s0 + {% endfor %} + + b .non_linear_loop + +.scalar_max: + vmov s0, r3 + {% for reg in (16..31) %} + vcmp.f32 s{{reg}}, s0 + vmrs apsr_nzcv, fpscr + vmovle s{{reg}}, s0 + {% endfor %} + + b .non_linear_loop 
+ +.scalar_add: + vmov s0, r3 + {% for s in (16..31) %} + vadd.f32 s{{s}}, s{{s}}, s0 + {% endfor %} + + b .non_linear_loop + +.scalar_mul: + vmov s0, r3 + {% for s in (16..31) %} + vmul.f32 s{{s}}, s{{s}}, s0 + {% endfor %} + + b .non_linear_loop + +.scalar_sub: + vmov s0, r3 + {% for s in (16..31) %} + vsub.f32 s{{s}}, s0, s{{s}} + {% endfor %} + + b .non_linear_loop + +.scalar_sub_flipped: + vmov s0, r3 + {% for s in (16..31) %} + vsub.f32 s{{s}}, s{{s}}, s0 + {% endfor %} + + b .non_linear_loop + +.leaky_relu: + vmov s0, r3 + {% for reg in (16..31) %} + vmul.f32 s1, s0, s{{reg}} + vcmp.f32 s{{reg}}, #0 + vmrs apsr_nzcv, fpscr + vmovlt s{{reg}}, s1 + {% endfor %} + b .non_linear_loop + +.per_row_min: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%} + vcmp.f32 {{s}}, s{{row}} + vmrs apsr_nzcv, fpscr + vmovge {{s}}, s{{row}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_row_max: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%} + vcmp.f32 {{s}}, s{{row}} + vmrs apsr_nzcv, fpscr + vmovlt {{s}}, s{{row}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_row_add: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vadd.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_row_mul: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vmul.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_row_sub: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{row}}, s{{col|times:4|plus:row|plus:16}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + 
+.per_row_sub_flipped: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_col_min: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%} + vcmp.f32 {{s}}, s{{col}} + vmrs apsr_nzcv, fpscr + vmovge {{s}}, s{{col}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_col_max: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%} + vcmp.f32 {{s}}, s{{col}} + vmrs apsr_nzcv, fpscr + vmovlt {{s}}, s{{col}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_col_add: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vadd.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_col_mul: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vmul.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_col_sub: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{col}}, s{{col|times:4|plus:row|plus:16}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.per_col_sub_flipped: + vldm r3, {s0, s1, s2, s3} + {% for row in (0..3) %} + {% for col in (0..3) %} + vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}} + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + vldmia r3!, { s0, s1 } + vldmia r4!, { s8, s9 } + + vmla.f32 s16, s0, s8 + vldmia r3!, { s2, s3 } + vmla.f32 s17, s1, s8 + vldmia r4!, { s10, s11 } + vmla.f32 s18, s2, s8 + 
vmla.f32 s19, s3, s8 + + vmla.f32 s20, s0, s9 + vmla.f32 s21, s1, s9 + vmla.f32 s22, s2, s9 + vmla.f32 s23, s3, s9 + + vmla.f32 s24, s0, s10 + vmla.f32 s25, s1, s10 + vmla.f32 s26, s2, s10 + vmla.f32 s27, s3, s10 + + vmla.f32 s28, s0, s11 + vmla.f32 s29, s1, s11 + vmla.f32 s30, s2, s11 + vmla.f32 s31, s3, s11 + + b .non_linear_loop + +.store: + {% for col in (0..3) %} + mov r8, r3 + {% for reg in (0..3) %} + fsts s{{col|times:4|plus:reg|plus:16}}, [ r8 ] + {% if reg < 3 %} + add r8, r8, r4 + {% endif %} + {% endfor %} + {% if col < 3 %} + add r3, r3, r5 + {% endif %} + {% endfor %} + + mov r0, #0 + b .return + +.load_tile: + vldmia r3!, { s16-s31 } + b .non_linear_loop + +.q_scale: +.q_shl: +.q_shr: + b .unsupported + +.return: + vpop { s16-s31 } + pop { r4-r12 } + + bx lr + diff --git a/vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq new file mode 100644 index 000000000..5386a420d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq @@ -0,0 +1,32 @@ +// vim: ft=arm + +.non_linear: + +.non_linear_loop_entry: + sub r0, #20 + +.non_linear_loop: + add r0, #20 + ldm r0, { r2, r3, r4, r5, r6 } + + cmp r2, #{{ jump_table | size }} + movgt r2, #{{ jump_table | size }} + cmp r2, #0 + movlt r2, #{{ jump_table | size }} + + add pc, pc, r2, LSL#2 + nop // pc in Rn above is start of the add instruction + 8, hence a nop is needed + // This is A32 asm, for T32/Thumb2 use nop.w and b.w to avoid problems.
+{% for j in jump_table %} + b .{{j}} +{% endfor %} + b .unsupported + +.unsupported: + mov r0, #1 + b .return + +.done: + mov r0, #0 + b .return + diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl new file mode 100644 index 000000000..8ff4125d4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl @@ -0,0 +1,533 @@ +// vim: ft=arm +.text +.align 4 + +/* Z: 32x1 + z0[0] .. z0[15] z1[0] .. z1[15] +*/ + + +.global {{G}}apple_amx_mmm_f16_64x1_{{suffix}} +{{G}}apple_amx_mmm_f16_64x1_{{suffix}}: + +{{ AMX_SET }} + + // set x1 to a 128 bytes aligned block for loads + mov x1, sp + lsr x1, x1, #7 + lsl x1, x1, #7 + sub x1, x1, 128 + +{% include "dispatcher.tmpliq" %} + +.leaky_relu: +.q_scale: +.q_shl: +.q_shr: + b .unsupported + +.add_mat_mul: + + ldr x2, [x0, #24] // b + ldp x3, x4, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + orr x4, x4, {{ 0|setting:62 }} // load a pair of A + + mov x5, {{ 0|setting:43 }} // f16 + orr x5, x5, {{ 0|setting:38 }} // Broadcast Y + + orr x6, x5, {{ 0|setting:20 }} // z offset + orr x6, x6, {{ 0|setting:16 }} // x offset + + cmp x3, #32 + blt .packed_packed_loop_1 + + mov x9, {{0|setting:32}} // Y broadcast offset += 1 + + .packed_packed_loop_32: + mov x7, x5 + mov x8, x6 + {% amx ldy x2 %} + {% for k in (0..31) %} + {% amx ldx x4 %} + add x4, x4, 128 + {% amx vecfp x7 %} + {% amx vecfp x8 %} + add x7, x7, x9 + add x8, x8, x9 + {% endfor %} + add x2, x2, #64 + sub x3, x3, #32 + cmp x3, #32 + bge .packed_packed_loop_32 + + cmp x3, #0 + beq .non_linear_loop + + .packed_packed_loop_1: + ldr w7, [x2], #2 + str w7, [x1] + {% amx ldx x4 %} + {% amx ldy x1 %} + {% amx vecfp x5 %} + {% amx vecfp x6 %} + add x4, x4, 128 + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.clear: + // top left + eor x2, x2, x2 + orr x2, x2, {{ 0|setting:27 }} + orr x2, x2, {{ 0|setting:28 }} 
+ orr x2, x2, {{ 0|setting:29 }} // Z = 0 + {% amx fma32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + {% amx fma32 x2 %} + + // bottom left + eor x2, x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_col_sub: + + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + // extr[hxyz] is super confusing + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + {% amx extrx x2 %} + {% amx fms16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} // extr[hxyz] is confusing + {% amx fms16 x4 %} + + // continue + +.per_col_add: + ldr x2, [x0, #8] + + // broadcast value to x0 + ld1 { v0.h }[0], [x2] + dup v0.8h, v0.h[0] + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + sub x1, x1, #64 + + {% amx ldx x1 %} // load into x0 by default + + mov x2, {{ 0|setting:28 }} // z += y + {% amx fma16 x2 %} + + orr x2, x2, {{ 0|setting:20 }} // target is now z1 + {% amx fma16 x2 %} + + b .non_linear_loop + +.per_col_sub_flipped: + ldr x2, [x0, #8] + + // broadcast value to x0 + ld1 { v0.h }[0], [x2] + dup v0.8h, v0.h[0] + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + sub x1, x1, #64 + + {% amx ldx x1 %} // load into x0 by default + + mov x2, {{ 0|setting:28 }} // z -= y (fms16 subtracts; comment previously said +=) + {% amx fms16 x2 %} + + orr x2, x2, {{ 0|setting:20 }} // target is now z1 + {% amx fms16 x2 %} + + b .non_linear_loop + +.per_row_sub_flipped: + ldr x2, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64 + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {%
amx ldy x2 %} + + mov x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // z -= y + + // top left + {% amx fms16 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fms16 x2 %} + + b .non_linear_loop + +.per_row_sub: + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + {% amx extrx x2 %} + {% amx fms16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} + {% amx fms16 x4 %} + + // continue + +.per_row_add: + ldr x2, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64 + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + mov x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // z += y + + // top left + {% amx fma16 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fma16 x2 %} + + b .non_linear_loop + +.per_row_min: + mov x2, 5 + b .per_row_min_max +.per_row_max: + mov x2, 7 +.per_row_min_max: + ldr x5, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x5], #64 + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x5] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + sub x1, x1, #64 + + orr x5, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x5 %} + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:44 }} // f32 + {% amx vecfp x2 %} + + orr x2, x2, {{ 0|setting:16 }} // x1 + orr x2, x2, {{ 0|setting:20 }} // z1 + {% amx vecfp x2 %} + + b .non_linear_loop + +.per_col_min: + mov x2, 5 + b .per_col_min_max +.per_col_max: + mov x2, 7 +.per_col_min_max: + ldr x4, [x0, #8] + 
+ // broadcast value to x0 + ld1 { v0.h }[0], [x4] + dup v0.8h, v0.h[0] + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + sub x1, x1, #64 + + {% amx ldx x1 %} + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:43 }} // f32 + + {% amx vecfp x2 %} + orr x2, x2, {{ 0|setting:20 }} // z offset + {% amx vecfp x2 %} + + b .non_linear_loop + +.per_col_mul: + ldr x4, [x0, #8] + + // broadcast value to y0 + ld1 { v0.h }[0], [x4] + dup v0.8h, v0.h[0] + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + st1 { v0.8h }, [x1], #16 + sub x1, x1, #64 + + {% amx ldy x1 %} + + eor x2, x2, x2 // X[0] = Z[0] + {% amx extrx x2 %} + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + {% amx fma16 x4 %} + orr x2, x2, {{ 0|setting:20 }} // Z1 + {% amx extrx x2 %} + orr x4, x4, {{ 0|setting:20 }} // Z1 + {% amx fma16 x4 %} + + b .non_linear_loop + +.per_row_mul: + ldr x2, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64 + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // pair + {% amx ldy x2 %} + + eor x2, x2, x2 // X[0] = Z[0] + {% amx extrx x2 %} + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + {% amx fma16 x4 %} + orr x2, x2, {{ 0|setting:20 }} // Z1 + {% amx extrx x2 %} + orr x4, x4, {{ 0|setting:20 }} // Z1 + orr x4, x4, {{ 0|setting:6 }} // Y1 + {% amx fma16 x4 %} + + b .non_linear_loop + +.scalar_sub: + // performs a unary neg on Z, then go to scalar_add + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + {% amx extrx x2 %} + {% amx fms16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} + {% amx 
fms16 x4 %} + + // continue on purpose + +.scalar_add: + ldr w5, [x0, #8] + + fmov h0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z+=X + {% amx fma16 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% amx fma16 x2 %} + b .non_linear_loop + +.scalar_sub_flipped: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z-=X + {% amx fms16 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% amx fms16 x2 %} + b .non_linear_loop + +.scalar_mul: + ldr w5, [x0, #8] + fmov h0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldy x1 %} + + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + + {% amx extrx x2 %} + {% amx fma16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} + {% amx fma16 x4 %} + + b .non_linear_loop + +.scalar_min: + mov x2, 5 + b .scalar_min_max +.scalar_max: + mov x2, 7 +.scalar_min_max: + ldr w5, [x0, #8] + fmov h0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} // load 16 values + + lsl x2, x2, 47 + orr x2, x2, {{ 0|setting:43 }} // f16 + + {% amx vecfp x2 %} + add x2, x2, {{ 0|setting:20}} // next Z + {% amx vecfp x2 %} + + b .non_linear_loop + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + {% for neon in (0..7) %} + {% for lane in (0..7) %} + ld1 { v{{neon}}.h }[{{lane}}], [x5], x6 + {% endfor %} + {% endfor %} + mov x8, x1 + st1 { 
v0.8h, v1.8h, v2.8h, v3.8h }, [x8], #64 + st1 { v4.8h, v5.8h, v6.8h, v7.8h }, [x8], #64 + + orr x8, x1, {{ 0|setting:62 }} // pair + {% amx ldy x8 %} + + eor x2, x2, x2 + orr x2, x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // perform Z0+=Y0 + {% amx fma16 x2 %} + orr x2, x2, {{ 0|setting:20 }} // Z1 + orr x2, x2, 64 // offset Y + {% amx fma16 x2 %} + + b .non_linear_loop + +.add_row_col_products: + ldp x5, x6, [x0, #8] // a base ptr, b base ptr + + ld1 { v0.h }[0], [x6] + st1 { v0.h }[0], [x1] + {% amx ldy x1 %} + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x2 %} + + mov x2, {{ 0|setting:43 }} // f16 + orr x2, x2, {{ 0|setting:38 }} // Broadcast Y + {% amx vecfp x2 %} + + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:16 }} // X offset + {% amx vecfp x2 %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + ands x8, x5, 0x7f + bne .store_generic + cmp x6, 4 + bne .store_generic + cmp x7, 4 + bne .store_generic + + orr x5, x5, {{ 0|setting:62 }} // pair + {% amx stz x5 %} + b .non_linear_loop + + .store_generic: + + orr x8, x1, {{ 0|setting:62 }} // pair + {% amx stz x8 %} + + mov x8, x1 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x8], #64 + ld1 { v4.8h, v5.8h, v6.8h, v7.8h }, [x8], #64 + {% for neon in (0..7) %} + {% for lane in (0..7) %} + st1 { v{{neon}}.h }[{{lane}}], [x5], x6 + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.load_tile: + ldr x2, [x0, #16] // row major ptr + orr x2, x2, {{0|setting:62}} // load pairs + {% amx ldz x2 %} + b .non_linear_loop + +.return: +{{ AMX_CLR }} +ret diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl 
b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl new file mode 100644 index 000000000..5c5bcea19 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl @@ -0,0 +1,658 @@ +// vim: ft=arm +.text +.align 4 + +/* Z: 64x32 tile. each Z reg is f16x32 + Z0 + Z2 + ... + Z62 + + Z1 + Z3 + S63 +*/ + + +.global {{G}}apple_amx_mmm_f16_64x32_{{suffix}} +{{G}}apple_amx_mmm_f16_64x32_{{suffix}}: + +{{ AMX_SET }} + + // set x1 to a 128 bytes aligned block for loads + mov x1, sp + lsr x1, x1, #7 + lsl x1, x1, #7 + sub x1, x1, 128 + +{% include "dispatcher.tmpliq" %} + +.leaky_relu: +.q_scale: +.q_shl: +.q_shr: + b .unsupported + +.add_mat_mul: + + ldr x2, [x0, #24] // b + ldp x3, x4, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + orr x4, x4, {{0|setting:62}} // load pairs (A) + + eor x5, x5, x5 // top left + + orr x7, x5, {{ 0|setting:20 }} + orr x7, x7, {{ 0|setting:6 }} // bottom left + + .packed_packed_loop_1: + {% amx ldx x2 %} + {% amx ldy x4 %} + add x2, x2, 64 + add x4, x4, 128 + + {% amx fma16 x5 %} + {% amx fma16 x7 %} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.clear: + // top left + eor x2, x2, x2 + orr x2, x2, {{ 0|setting:27 }} + orr x2, x2, {{ 0|setting:28 }} + orr x2, x2, {{ 0|setting:29 }} // Z = 0 + {% amx fma32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + {% amx fma32 x2 %} + + // bottom left + eor x2, x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fma32 x2 %} + + mov x3, #16 + str x3, [x1] + + b .non_linear_loop + +.per_col_sub: + + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + mov x6, 64 + .per_col_sub_loop: + {% amx extrx x2 %} + {% amx fms16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, 
{{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .per_col_sub_loop + + // continue + +.per_col_add: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} + + mov x2, {{ 0|setting:28 }} // z += y + + // top left + {% amx fma16 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fma16 x2 %} + + b .non_linear_loop + +.per_col_sub_flipped: + ldr x2, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + + {% amx ldx x1 %} + + mov x2, {{ 0|setting:28 }} // z += y + + {% amx fms16 x2 %} + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fms16 x2 %} + + b .non_linear_loop + +.per_row_sub_flipped: + ldr x2, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64 + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + mov x2, {{ 0|setting:29 }} // z += y + + // top left + {% amx fms16 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:20 }} // Z row = 3 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fms16 x2 %} + + b .non_linear_loop + +.per_row_sub: + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + mov x6, 64 + .per_row_sub_loop: + {% amx extrx x2 %} + {% amx fms16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .per_row_sub_loop + + // continue + +.per_row_add: + ldr x2, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64 + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64 + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy 
x2 %} + + mov x2, {{ 0|setting:29 }} // z += y + + // top left + {% amx fma16 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fma16 x2 %} + + b .non_linear_loop + +.per_row_min: + mov x2, 5 + b .per_row_min_max +.per_row_max: + mov x2, 7 +.per_row_min_max: + ldr x5, [x0, #8] + + add x6, x5, 64 + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:43 }} // f16 + + orr x8, x2, {{ 0|setting:20 }} // bottom left + + mov x4, 32 + .loop_per_row_max: + // top half + ld1 { v0.h }[0], [x5], #2 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + + {% amx ldx x1 %} + {% amx vecfp x2 %} + + add x2, x2, {{ 0|setting:21 }} + + // bottom half + ld1 { v0.h }[0], [x6], #2 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + + {% amx ldx x1 %} + {% amx vecfp x8 %} + + add x8, x8, {{ 0|setting:21 }} + + subs x4, x4, 1 + bne .loop_per_row_max + + b .non_linear_loop + +.per_col_min: + mov x2, 5 + b .per_col_min_max +.per_col_max: + mov x2, 7 +.per_col_min_max: + ldr x4, [x0, #8] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x4] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:43 }} // f16 + + mov x4, 64 + .loop_per_col_max: + {% amx vecfp x2 %} + add x2, x2, {{ 0|setting:20 }} + subs x4, x4, 1 + bne .loop_per_col_max + + b .non_linear_loop + +.per_col_mul: + ldr x4, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x4] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldy x1 %} + + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + + mov x6, 64 + .loop_per_col_mul: + {% amx extrx x2 %} + {% amx fma16 x4 %} + add x2, x2, {{0|setting:20}} + add x4, x4, {{0|setting:20}} + subs x6, x6, 1 + bne 
.loop_per_col_mul + + b .non_linear_loop + +.per_row_mul: + ldr x14, [x0, #8] + add x15, x14, 64 + + // extrx + eor x2, x2, x2 // X[0] = Z[0] (top left) + + eor x4, x4, x4 + orr x4, x4, {{0|setting:20}} // X[0] = Z[1] (bottom left) + + // fma16 + eor x6, x6, x6 + orr x6, x6, {{0|setting:63}} // vector mode + orr x6, x6, {{0|setting:27}} // Z=X*Y Z[0]=X[0]*Y[0] + + orr x8, x6, {{0|setting:20}} // Z[1] + + mov x10, 32 + .loop_per_row_mul: + // top + ld1 { v0.h }[0], [x14], #2 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + + {% amx ldy x1 %} + {% amx extrx x2 %} + {% amx fma16 x6 %} + + add x2, x2, {{ 0|setting:21 }} + add x6, x6, {{ 0|setting:21 }} + + // bottom + ld1 { v0.h }[0], [x15], #2 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + + {% amx ldy x1 %} + {% amx extrx x4 %} + {% amx fma16 x8 %} + + add x4, x4, {{ 0|setting:21 }} + add x8, x8, {{ 0|setting:21 }} + + subs x10, x10, 1 + bne .loop_per_row_mul + + b .non_linear_loop + +.scalar_sub: + // performs a unary neg on Z, then go to scalar_add + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + mov x6, 64 + .scalar_sub_loop: + {% amx extrx x2 %} + {% amx fms16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .scalar_sub_loop + + // continue on purpose + +.scalar_add: + ldr w5, [x0, #8] + + fmov h0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z+=X + {% amx fma16 x2 %} + add x2, x2, {{0|setting:20}} // Z1 + {% amx fma16 x2 %} + b .non_linear_loop + +.scalar_sub_flipped: + ldr w5, [x0, #8] + + fmov h0, w5 + dup v0.8h, v0.h[0] + 
dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} // load 32 values + + mov x2, {{ 0|setting:28 }} // Z-=X + {% amx fms16 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% amx fms16 x2 %} + b .non_linear_loop + +.scalar_mul: + ldr w5, [x0, #8] + + fmov h0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldy x1 %} // load 32 values + + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + + mov x6, 64 + .scalar_mul_loop: + {% amx extrx x2 %} + {% amx fma16 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .scalar_mul_loop + + b .non_linear_loop + +.scalar_min: + mov x2, 5 + b .scalar_min_max +.scalar_max: + mov x2, 7 +.scalar_min_max: + ldr w5, [x0, #8] + + fmov h0, w5 + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldx x1 %} // load 16 values + + lsl x2, x2, 47 + orr x2, x2, {{ 0|setting:43 }} // f32 + + mov x3, 64 + .loop_scalar_max: + add x2, x2, {{ 0|setting:20}} // next Z + {% amx vecfp x2 %} + subs x3, x3, 1 + bne .loop_scalar_max + + b .non_linear_loop + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + mov x3, 0 // x3 is the row + .loop_load: + // z reg is (row % 32) * 2 + (row / 32) + and x9, x3, 0x1f + lsl x9, x9, 1 + lsr x10, x3, 5 + add x9, x9, x10 + + mov x4, x5 + {% for neon in (0..3) %} + {% for lane in (0..7) %} + ld1 { v{{neon}}.h }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + + st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + {% amx ldy x1 %} + + lsl x2, x9, 20 // Z register to update + orr x2, x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // perform Z+=Y + {% amx fma16 x2 %} + + 
add x5, x5, x6 + add x3, x3, 1 + cmp x3, 64 + bne .loop_load + + /* + mov x3, 0 // x3 is the row + .loop_load: + and x9, x3, 0xf // x9 = row % 16 + lsl x9, x9, 2 // x9 = (row % 16) * 4 + lsr x10, x3, 4 // x10 = row / 16 + lsl x10, x10, 1 // x10 = (row / 16) * 2 + add x9, x9, x10 // x9 = x9 + x10 + + mov x4, x5 + {% for neon in (0..3) %} + {% for lane in (0..3) %} + ld1 { v{{neon}}.s }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% for neon in (0..3) %} + {% for lane in (0..3) %} + ld1 { v{{neon}}.s }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8] + + mov x2, x1 + orr x2, x2, {{ 0|setting:62 }} // load 32 values + {% amx ldy x2 %} + + lsl x2, x9, 20 // left Z register to update + orr x2, x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // perform Z+=Y + {% amx fma32 x2 %} + + add x2, x2, {{0|setting:20}} + orr x2, x2, 64 // offset Y by 16 values + {% amx fma32 x2 %} + + add x5, x5, x6 + add x3, x3, 1 + cmp x3, 32 + bne .loop_load + +*/ + + b .non_linear_loop + +.add_row_col_products: + ldp x5, x6, [x0, #8] // a base ptr, b base ptr + + add x8, x1, 64 + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + + {% amx ldx x1 %} + + // top + eor x2, x2, x2 + {% amx fma16 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fma16 x2 %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x7, 2 + bne .store_generic + ands x8, x5, 0x7f + bne .store_generic + ands x8, x6, 0x7f + bne .store_generic + + lsl x8, x6, 5 + 
add x8, x8, x5 // x8 = 32*rsc + orr x8, x8, {{ 0|setting:56 }} // first to x8 is z1 + + mov x4, {{0|setting:57}} // Zreg += 2 + add x4, x4, x6 // +rsc + + mov x3, 32 + .loop_store_direct: + {% amx stz x5 %} + {% amx stz x8 %} + add x5, x5, x4 + add x8, x8, x4 + subs x3, x3, 1 + bne .loop_store_direct + + b .non_linear_loop + +.store_generic: + + mov x3, 0 // row id + .loop_store: + // z reg is (row % 32) * 2 + (row / 32) + and x9, x3, 0x1f + lsl x9, x9, 1 + lsr x10, x3, 5 + add x9, x9, x10 + + lsl x2, x9, 56 + orr x2, x2, x1 + {% amx stz x2 %} // f16 x 32 + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1] + + mov x4, x5 + {% for neon in (0..3) %} + {% for lane in (0..7) %} + st1 { v{{neon}}.h }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + add x5, x5, x6 + + add x3, x3, 1 + cmp x3, 64 + bne .loop_store + b .non_linear_loop + +.load_tile: + ldr x2, [x0, #16] // row major ptr + orr x3, x2, {{0|setting:56}} + add x3, x3, #2048 + + mov x4, {{0|setting:57}} // z+=2 + add x4, x4, #64 + + mov x8, 32 + .loop_load_tile: + {% amx ldz x2 %} + {% amx ldz x3 %} + add x2, x2, x4 + add x3, x3, x4 + subs x8, x8, 1 + bne .loop_load_tile + + b .non_linear_loop + +.return: +{{ AMX_CLR }} +ret diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl new file mode 100644 index 000000000..132283f7f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl @@ -0,0 +1,533 @@ +// vim: ft=arm +.text +.align 4 + +/* Z: 32x1 + z0[0] .. z0[15] z1[0] .. 
z1[15] +*/ + + +.global {{G}}apple_amx_mmm_f32_32x1_{{suffix}} +{{G}}apple_amx_mmm_f32_32x1_{{suffix}}: + +{{ AMX_SET }} + + // set x1 to a 128 bytes aligned block for loads + mov x1, sp + lsr x1, x1, #7 + lsl x1, x1, #7 + sub x1, x1, 128 + +{% include "dispatcher.tmpliq" %} + +.leaky_relu: +.q_scale: +.q_shl: +.q_shr: + b .unsupported + +.add_mat_mul: + + ldr x2, [x0, #24] // b + ldp x3, x4, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + orr x4, x4, {{ 0|setting:62 }} // load a pair of A + + mov x5, {{ 0|setting:44 }} // f32 + orr x5, x5, {{ 0|setting:38 }} // Broadcast Y + + orr x6, x5, {{ 0|setting:20 }} // z offset + orr x6, x6, {{ 0|setting:16 }} // x offset + + cmp x3, #16 + blt .packed_packed_loop_1 + + mov x9, {{0|setting:32}} // Y broadcast offset += 1 + + .packed_packed_loop_16: + mov x7, x5 + mov x8, x6 + {% amx ldy x2 %} + {% for k in (0..15) %} + {% amx ldx x4 %} + add x4, x4, 128 + {% amx vecfp x7 %} + {% amx vecfp x8 %} + add x7, x7, x9 + add x8, x8, x9 + {% endfor %} + add x2, x2, #64 + sub x3, x3, #16 + cmp x3, #16 + bge .packed_packed_loop_16 + + cmp x3, #0 + beq .non_linear_loop + + .packed_packed_loop_1: + ldr w7, [x2], #4 + str w7, [x1] + {% amx ldx x4 %} + {% amx ldy x1 %} + {% amx vecfp x5 %} + {% amx vecfp x6 %} + add x4, x4, 128 + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.clear: + // top left + eor x2, x2, x2 + orr x2, x2, {{ 0|setting:27 }} + orr x2, x2, {{ 0|setting:28 }} + orr x2, x2, {{ 0|setting:29 }} // Z = 0 + {% amx fma32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + {% amx fma32 x2 %} + + // bottom left + eor x2, x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_col_sub: + + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + // extr[hxyz] is suport confusing + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 
}} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + {% amx extrx x2 %} + {% amx fms32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} // extr[hxyz] is confusing + {% amx fms32 x4 %} + + // continue + +.per_col_add: + ldr x2, [x0, #8] + + // broadcast value to x0 + ld1 { v0.s }[0], [x2] + dup v0.4s, v0.s[0] + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + sub x1, x1, #64 + + {% amx ldx x1 %} // load into x0 by default + + mov x2, {{ 0|setting:28 }} // z += y + {% amx fma32 x2 %} + + orr x2, x2, {{ 0|setting:20 }} // target is now z1 + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_col_sub_flipped: + ldr x2, [x0, #8] + + // broadcast value to x0 + ld1 { v0.s }[0], [x2] + dup v0.4s, v0.s[0] + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + sub x1, x1, #64 + + {% amx ldx x1 %} // load into x0 by default + + mov x2, {{ 0|setting:28 }} // z += y + {% amx fms32 x2 %} + + orr x2, x2, {{ 0|setting:20 }} // target is now z1 + {% amx fms32 x2 %} + + b .non_linear_loop + +.per_row_sub_flipped: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + mov x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // z -= y + + // top left + {% amx fms32 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fms32 x2 %} + + b .non_linear_loop + +.per_row_sub: + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + {% amx extrx x2 %} + {% amx fms32 x4 %} 
+ add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} + {% amx fms32 x4 %} + + // continue + +.per_row_add: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + mov x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // z += y + + // top left + {% amx fma32 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_row_min: + mov x2, 5 + b .per_row_min_max +.per_row_max: + mov x2, 7 +.per_row_min_max: + ldr x5, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x5, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x5 %} + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:44 }} // f32 + {% amx vecfp x2 %} + + orr x2, x2, {{ 0|setting:16 }} // x1 + orr x2, x2, {{ 0|setting:20 }} // z1 + {% amx vecfp x2 %} + + b .non_linear_loop + +.per_col_min: + mov x2, 5 + b .per_col_min_max +.per_col_max: + mov x2, 7 +.per_col_min_max: + ldr x4, [x0, #8] + + // broadcast value to x0 + ld1 { v0.s }[0], [x4] + dup v0.4s, v0.s[0] + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + sub x1, x1, #64 + + {% amx ldx x1 %} + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:44 }} // f32 + + {% amx vecfp x2 %} + orr x2, x2, {{ 0|setting:20 }} // z offset + {% amx vecfp x2 %} + + b .non_linear_loop + +.per_col_mul: + ldr x4, [x0, #8] + + // broadcast value to y0 + ld1 { v0.s }[0], [x4] + dup v0.4s, v0.s[0] + st1 { v0.4s }, 
[x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + st1 { v0.4s }, [x1], #16 + sub x1, x1, #64 + + {% amx ldy x1 %} + + eor x2, x2, x2 // X[0] = Z[0] + {% amx extrx x2 %} + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + {% amx fma32 x4 %} + orr x2, x2, {{ 0|setting:20 }} // Z1 + {% amx extrx x2 %} + orr x4, x4, {{ 0|setting:20 }} // Z1 + {% amx fma32 x4 %} + + b .non_linear_loop + +.per_row_mul: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // pair + {% amx ldy x2 %} + + eor x2, x2, x2 // X[0] = Z[0] + {% amx extrx x2 %} + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + {% amx fma32 x4 %} + orr x2, x2, {{ 0|setting:20 }} // Z1 + {% amx extrx x2 %} + orr x4, x4, {{ 0|setting:20 }} // Z1 + orr x4, x4, {{ 0|setting:6 }} // Y1 + {% amx fma32 x4 %} + + b .non_linear_loop + +.scalar_sub: + // performs a unary neg on Z, then go to scalar_add + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + {% amx extrx x2 %} + {% amx fms32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} + {% amx fms32 x4 %} + + // continue on purpose + +.scalar_add: + ldr w5, [x0, #8] + + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z+=X + {% amx fma32 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% amx fma32 x2 %} + b .non_linear_loop + +.scalar_sub_flipped: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, 
v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z-=X + {% amx fms32 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% amx fms32 x2 %} + b .non_linear_loop + +.scalar_mul: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldy x1 %} // load 16 values + + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + + {% amx extrx x2 %} + {% amx fma32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + {% amx extrx x2 %} + {% amx fma32 x4 %} + + b .non_linear_loop + +.scalar_min: + mov x2, 5 + b .scalar_min_max +.scalar_max: + mov x2, 7 +.scalar_min_max: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} // load 16 values + + lsl x2, x2, 47 + orr x2, x2, {{ 0|setting:44 }} // f32 + + {% amx vecfp x2 %} + add x2, x2, {{ 0|setting:20}} // next Z + {% amx vecfp x2 %} + + b .non_linear_loop + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + {% for neon in (0..7) %} + {% for lane in (0..3) %} + ld1 { v{{neon}}.s }[{{lane}}], [x5], x6 + {% endfor %} + {% endfor %} + mov x8, x1 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8], #64 + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x8], #64 + + orr x8, x1, {{ 0|setting:62 }} // pair + {% amx ldy x8 %} + + eor x2, x2, x2 + orr x2, x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // perform Z0+=Y0 + {% amx fma32 x2 %} + orr x2, x2, {{ 0|setting:20 }} // Z1 + orr x2, x2, 64 // offset Y by 16 values + {% amx fma32 x2 %} + + b .non_linear_loop + +.add_row_col_products: + ldp x5, x6, [x0, #8] // a base ptr, b base ptr + + ld1 { v0.s }[0], [x6] + 
st1 { v0.s }[0], [x1] + {% amx ldy x1 %} + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x2 %} + + mov x2, {{ 0|setting:44 }} // f32 + orr x2, x2, {{ 0|setting:38 }} // Broadcast Y + {% amx vecfp x2 %} + + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:16 }} // X offset + {% amx vecfp x2 %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + ands x8, x5, 0x7f + bne .store_generic + cmp x6, 4 + bne .store_generic + cmp x7, 4 + bne .store_generic + + orr x5, x5, {{ 0|setting:62 }} // pair + {% amx stz x5 %} + b .non_linear_loop + + .store_generic: + + orr x8, x1, {{ 0|setting:62 }} // pair + {% amx stz x8 %} + + mov x8, x1 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8], #64 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x8], #64 + {% for neon in (0..7) %} + {% for lane in (0..3) %} + st1 { v{{neon}}.s }[{{lane}}], [x5], x6 + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.load_tile: + ldr x2, [x0, #16] // row major ptr + orr x2, x2, {{0|setting:62}} // load pairs + {% amx ldz x2 %} + b .non_linear_loop + +.return: +{{ AMX_CLR }} +ret diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl new file mode 100644 index 000000000..940efc74c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl @@ -0,0 +1,764 @@ +// vim: ft=arm +.text +.align 4 + +/* Z: 32x32 + z0[0] .. z0[15] z1[0] .. z1[15] + z4[0] .. z4[15] z5[0] .. z5[15] + .. +z60[0] .. z60[15] z61[0] .. z61[15] + + z2[0] .. z2[15] z3[0] .. z3[15] + z5[0] .. z5[15] z6[0] .. z6[15] + .. +z62[0] .. z62[15] z63[0] .. 
z63[15] +*/ + + +.global {{G}}apple_amx_mmm_f32_32x32_{{suffix}} +{{G}}apple_amx_mmm_f32_32x32_{{suffix}}: + +{{ AMX_SET }} + + // set x1 to a 128 bytes aligned block for loads + mov x1, sp + lsr x1, x1, #7 + lsl x1, x1, #7 + sub x1, x1, 128 + +{% include "dispatcher.tmpliq" %} + +.leaky_relu: +.q_scale: +.q_shl: +.q_shr: + b .unsupported + +.add_mat_mul: + + ldr x2, [x0, #24] // b + ldp x3, x4, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + orr x4, x4, {{0|setting:62}} // load pairs (A) + orr x2, x2, {{0|setting:62}} // load pairs (B) + + eor x5, x5, x5 // top left + + orr x6, x5, {{ 0|setting:20 }} // Z row = 1 + orr x6, x6, {{ 0|setting:16 }} // top right + + orr x7, x5, {{ 0|setting:21 }} + orr x7, x7, {{ 0|setting:6 }} // bottom left + + orr x8, x7, x6 // bottom right + + .packed_packed_loop_1: + {% amx ldx x2 %} + {% amx ldy x4 %} + add x2, x2, 128 + add x4, x4, 128 + + {% amx fma32 x5 %} + {% amx fma32 x6 %} + {% amx fma32 x7 %} + {% amx fma32 x8 %} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.clear: + // top left + eor x2, x2, x2 + orr x2, x2, {{ 0|setting:27 }} + orr x2, x2, {{ 0|setting:28 }} + orr x2, x2, {{ 0|setting:29 }} // Z = 0 + {% amx fma32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + {% amx fma32 x2 %} + + // bottom left + eor x2, x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_col_sub: + + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + mov x6, 64 + .per_col_sub_loop: + {% amx extrx x2 %} + {% amx fms32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .per_col_sub_loop + + // continue + +.per_col_add: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, 
v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x1, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x1 %} + + mov x2, {{ 0|setting:28 }} // z += y + + // top left + {% amx fma32 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:21 }} // Z row = 2 + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:16 }} // X offset + orr x2, x2, {{ 0|setting:20 }} // Z row = 3 + {% amx fma32 x2 %} + + // top right + eor x2, x2, {{ 0|setting:21 }} // Z row = 1 + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_col_sub_flipped: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x1, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x1 %} + + mov x2, {{ 0|setting:28 }} // z += y + + // top left + {% amx fms32 x2 %} + + // bottom left + orr x2, x2, {{ 0|setting:21 }} // Z row = 2 + {% amx fms32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:16 }} // X offset + orr x2, x2, {{ 0|setting:20 }} // Z row = 3 + {% amx fms32 x2 %} + + // top right + eor x2, x2, {{ 0|setting:21 }} // Z row = 1 + {% amx fms32 x2 %} + + + b .non_linear_loop + +.per_row_sub_flipped: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + mov x2, {{ 0|setting:29 }} // z += y + + // top left + {% amx fms32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fms32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fms32 x2 %} + + // bottom left + eor x2, 
x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fms32 x2 %} + + b .non_linear_loop + +.per_row_sub: + // performs a unary neg on Z + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + mov x6, 64 + .per_row_sub_loop: + {% amx extrx x2 %} + {% amx fms32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .per_row_sub_loop + + // continue + +.per_row_add: + ldr x2, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + mov x2, {{ 0|setting:29 }} // z += y + + // top left + {% amx fma32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fma32 x2 %} + + // bottom left + eor x2, x2, {{ 0|setting:20 }} // Z row = 2 + {% amx fma32 x2 %} + + b .non_linear_loop + +.per_row_min: + mov x2, 5 + b .per_row_min_max +.per_row_max: + mov x2, 7 +.per_row_min_max: + ldr x5, [x0, #8] + + add x6, x5, 64 + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:44 }} // f32 + orr x3, x2, {{ 0|setting:20 }} // right half: z offset + + orr x8, x2, {{ 0|setting:21 }} // bottom left + orr x9, x3, {{ 0|setting:21 }} // bottom right + + mov x4, 16 + .loop_per_row_max: + // top half + ld1 { v0.s }[0], [x5], #4 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + + {% amx ldx x1 %} + {% amx vecfp x2 %} + {% amx vecfp x3 %} + + add x2, x2, {{ 0|setting:22 }} + add x3, x3, {{ 0|setting:22 }} + + // bottom half + ld1 { v0.s }[0], [x6], #4 + dup v0.4s, v0.s[0] + dup 
v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + + {% amx ldx x1 %} + {% amx vecfp x8 %} + {% amx vecfp x9 %} + + add x8, x8, {{ 0|setting:22 }} + add x9, x9, {{ 0|setting:22 }} + + subs x4, x4, 1 + bne .loop_per_row_max + + b .non_linear_loop + +.per_col_min: + mov x2, 5 + b .per_col_min_max +.per_col_max: + mov x2, 7 +.per_col_min_max: + ldr x4, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x4], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x4] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x3, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x3 %} + + lsl x2, x2, 47 // max(x,z) (or min) + orr x2, x2, {{ 0|setting:44 }} // f32 + + orr x3, x2, {{ 0|setting:16 }} // right half: x offset + orr x3, x3, {{ 0|setting:20 }} // right half: z offset + + mov x4, 32 + .loop_per_col_max: + {% amx vecfp x2 %} + {% amx vecfp x3 %} + add x2, x2, {{ 0|setting:21 }} + add x3, x3, {{ 0|setting:21 }} + subs x4, x4, 1 + bne .loop_per_col_max + + b .non_linear_loop + +.per_col_mul: + ldr x4, [x0, #8] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x4], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x4] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + eor x2, x2, x2 // X[0] = Z[0] + + eor x3, x3, x3 + orr x3, x3, {{0|setting:20 }} // Z[1] + orr x3, x3, {{0|setting:16 }} // X[1] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + + mov x5, {{ 0|setting:63 }} // vector mode + orr x5, x5, {{ 0|setting:27 }} // Z=X*Y + orr x5, x5, {{ 0|setting:20 }} // Z right + orr x5, x5, {{ 0|setting:16 }} // X[1] (right) + orr x5, x5, {{ 0|setting:6 }} // Y[1] (right) + + mov x6, 32 + .loop_per_col_mul: + {% amx extrx x2 %} + {% amx extrx x3 %} + {% amx fma32 x4 %} + {% amx fma32 x5 %} + add x2, x2, {{0|setting:21}} + 
add x3, x3, {{0|setting:21}} + add x4, x4, {{0|setting:21}} + add x5, x5, {{0|setting:21}} + subs x6, x6, 1 + bne .loop_per_col_mul + + b .non_linear_loop + +.per_row_mul: + ldr x14, [x0, #8] + add x15, x14, 64 + + // extrx + eor x2, x2, x2 // X[0] = Z[0] (top left) + + eor x3, x3, x3 + orr x3, x3, {{0|setting:20 }} // Z[1] + orr x3, x3, {{0|setting:16 }} // X[1] = Z[1] (top right) + + eor x4, x4, x4 + orr x4, x4, {{0|setting:21}} // X[0] = Z[2] (bottom left) + + orr x5, x4, {{0|setting:20}} + orr x5, x5, {{0|setting:16}} // X[1] = Z[3] (bottom right) + + // fma32 + eor x6, x6, x6 + orr x6, x6, {{0|setting:63}} // vector mode + orr x6, x6, {{0|setting:27}} // Z=X*Y Z[0]=X[0]*Y[0] + + orr x7, x6, {{0|setting:20}} // Z[1] + orr x7, x7, {{0|setting:16}} // X[1] Z[1] = X[1]*Y[0] + + orr x8, x6, {{0|setting:21}} // Z[2] + orr x8, x8, {{0|setting:21}} // Z[2] + + orr x9, x8, {{0|setting:20}} // Z[3] + orr x9, x9, {{0|setting:16}} // X[1] + + mov x10, 16 + .loop_per_row_mul: + // top + ld1 { v0.s }[0], [x14], #4 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + + {% amx ldy x1 %} + {% amx extrx x2 %} + {% amx extrx x3 %} + {% amx fma32 x6 %} + {% amx fma32 x7 %} + + add x2, x2, {{ 0|setting:22 }} + add x3, x3, {{ 0|setting:22 }} + add x6, x6, {{ 0|setting:22 }} + add x7, x7, {{ 0|setting:22 }} + + // bottom + ld1 { v0.s }[0], [x15], #4 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + + {% amx ldy x1 %} + {% amx extrx x4 %} + {% amx extrx x5 %} + {% amx fma32 x8 %} + {% amx fma32 x9 %} + + add x4, x4, {{ 0|setting:22 }} + add x5, x5, {{ 0|setting:22 }} + add x8, x8, {{ 0|setting:22 }} + add x9, x9, {{ 0|setting:22 }} + + subs x10, x10, 1 + bne .loop_per_row_mul + + b .non_linear_loop + +.scalar_sub: + // performs a unary neg on Z, then go to scalar_add + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // 
vector mode + orr x4, x4, {{ 0|setting:28 }} + orr x4, x4, {{ 0|setting:27 }} // Z=-X + + mov x6, 64 + .scalar_sub_loop: + {% amx extrx x2 %} + {% amx fms32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .scalar_sub_loop + + // continue on purpose + +.scalar_add: + ldr w5, [x0, #8] + + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z+=X + {% for chunk in (0..3) %} + {% amx fma32 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% endfor %} + b .non_linear_loop + +.scalar_sub_flipped: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} // load 16 values + + mov x2, {{ 0|setting:28 }} // Z-=X + {% for chunk in (0..3) %} + {% amx fms32 x2 %} + add x2, x2, {{0|setting:20}} // next Z row + {% endfor %} + b .non_linear_loop + +.scalar_mul: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldy x1 %} // load 16 values + + eor x2, x2, x2 // X[0] = Z[0] + + mov x4, {{ 0|setting:63 }} // vector mode + orr x4, x4, {{ 0|setting:27 }} // Z=X*Y + + mov x6, 64 + .scalar_mul_loop: + {% amx extrx x2 %} + {% amx fma32 x4 %} + add x2, x2, {{0|setting:20}} // next Z row + add x4, x4, {{0|setting:20}} // next Z row + subs x6, x6, 1 + bne .scalar_mul_loop + + b .non_linear_loop + +.scalar_min: + mov x2, 5 + b .scalar_min_max +.scalar_max: + mov x2, 7 +.scalar_min_max: + ldr w5, [x0, #8] + fmov s0, w5 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% amx ldx x1 %} // load 16 values + + lsl x2, x2, 47 + orr x2, x2, {{ 
0|setting:44 }} // f32 + + mov x3, 64 + .loop_scalar_max: + add x2, x2, {{ 0|setting:20}} // next Z + {% amx vecfp x2 %} + subs x3, x3, 1 + bne .loop_scalar_max + + b .non_linear_loop + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + add x8, x1, 64 + + mov x3, 0 // x3 is the row + .loop_load: + and x9, x3, 0xf // x9 = row % 16 + lsl x9, x9, 2 // x9 = (row % 16) * 4 + lsr x10, x3, 4 // x10 = row / 16 + lsl x10, x10, 1 // x10 = (row / 16) * 2 + add x9, x9, x10 // x9 = x9 + x10 + + mov x4, x5 + {% for neon in (0..3) %} + {% for lane in (0..3) %} + ld1 { v{{neon}}.s }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + {% for neon in (0..3) %} + {% for lane in (0..3) %} + ld1 { v{{neon}}.s }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8] + + mov x2, x1 + orr x2, x2, {{ 0|setting:62 }} // load 32 values + {% amx ldy x2 %} + + lsl x2, x9, 20 // left Z register to update + orr x2, x2, {{ 0|setting:63 }} // vector mode + orr x2, x2, {{ 0|setting:29 }} // perform Z+=Y + {% amx fma32 x2 %} + + add x2, x2, {{0|setting:20}} + orr x2, x2, 64 // offset Y by 16 values + {% amx fma32 x2 %} + + add x5, x5, x6 + add x3, x3, 1 + cmp x3, 32 + bne .loop_load + + b .non_linear_loop + +.add_row_col_products: + ldp x5, x6, [x0, #8] // a base ptr, b base ptr + + add x8, x1, 64 + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x5] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldy x2 %} + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6], #64 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6] + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + sub x1, x1, #64 + + orr x2, x1, {{ 0|setting:62 }} // load a pair + {% amx ldx x2 %} + + // top left + eor x2, x2, x2 + {% 
amx fma32 x2 %} + + // top right + orr x2, x2, {{ 0|setting:20 }} // Z row = 1 + orr x2, x2, {{ 0|setting:16 }} // X offset + {% amx fma32 x2 %} + + // bottom right + orr x2, x2, {{ 0|setting:21 }} // Z row = 3 + orr x2, x2, {{ 0|setting:6 }} // Y offset + {% amx fma32 x2 %} + + // bottom left + eor x2, x2, {{ 0|setting:20 }} // Z row = 2 + eor x2, x2, {{ 0|setting:16 }} // X offset <- + {% amx fma32 x2 %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x7, 4 + bne .store_generic + ands x8, x5, 0x7f + bne .store_generic + ands x8, x6, 0x7f + bne .store_generic + + orr x5, x5, {{ 0|setting:62 }} // pair + lsl x8, x6, 4 + add x8, x8, x5 // x8 = 16*rsc + orr x8, x8, {{ 0|setting:57 }} // first to x8 is z2 + + mov x4, {{0|setting:58}} // Zreg += 4 + add x4, x4, x6 // +rsc + + mov x3, 16 + .loop_store_direct: + {% amx stz x5 %} + {% amx stz x8 %} + add x5, x5, x4 + add x8, x8, x4 + subs x3, x3, 1 + bne .loop_store_direct + + b .non_linear_loop + +.store_generic: + + add x8, x1, 64 + + mov x3, 0 // row id + .loop_store: + and x9, x3, 0xf // x9 = row % 16 + lsl x9, x9, 2 // x9 = (row % 16) * 4 + lsr x10, x3, 4 // x10 = row / 16 + lsl x10, x10, 1 // x10 = (row / 16) * 2 + add x9, x9, x10 // x9 = x9 + x10 + + lsl x2, x9, 56 + orr x2, x2, {{ 0|setting:62 }} + orr x2, x2, x1 + {% amx stz x2 %} + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1] + + mov x4, x5 + {% for neon in (0..3) %} + {% for lane in (0..3) %} + st1 { v{{neon}}.s }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8] + {% for neon in (0..3) %} + {% for lane in (0..3) %} + st1 { v{{neon}}.s }[{{lane}}], [x4], x7 + {% endfor %} + {% endfor %} + add x5, x5, x6 + + add x3, x3, 1 + cmp x3, 32 + bne .loop_store + + b .non_linear_loop + +.load_tile: + ldr x2, [x0, #16] // row major ptr + orr x2, x2, {{0|setting:62}} // load pairs + mov x3, x2 + orr x3, x3, {{0|setting:57}} + add x3, x3, #2048 + + mov x4, 
{{0|setting:58}} // z+=4 + add x4, x4, #128 + + mov x8, 16 + .loop_load_tile: + {% amx ldz x2 %} + {% amx ldz x3 %} + add x2, x2, x4 + add x3, x3, x4 + subs x8, x8, 1 + bne .loop_load_tile + + b .non_linear_loop + +.return: +{{ AMX_CLR }} +ret diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq new file mode 100644 index 000000000..150db4683 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq @@ -0,0 +1,37 @@ +// vim: ft=arm + +.non_linear: + sub x0, x0, 40 + +.non_linear_loop: + add x0, x0, 40 + ldr x2, [x0] + + mov x4, #{{ jump_table | size }} + + cmp x2, #{{ jump_table | size }} + csel x2, x2, x4, lt + cmp x2, #0 + csel x2, x4, x2, lt + + adr x3, .jmp_table + add x3, x3, x2, LSL#2 + br x3 + +.jmp_table: +{% for j in jump_table %} + b .{{j}} +{% endfor %} + b .unsupported + + add x0, x2, #4000 + b .return + +.unsupported: + mov x0, #1 + b .return + +.done: + mov x0, 0 + b .return + diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs b/vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs new file mode 100644 index 000000000..5912b742c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs @@ -0,0 +1,191 @@ +use liquid::model::KString; +use liquid::partials::PartialCompiler; +use liquid::{ParserBuilder, ValueView}; +use liquid_core::{ + Display_filter, Expression, Filter, FilterParameters, FilterReflection, FromFilterParameters, + ParseFilter, ParseTag, Renderable, Runtime, TagReflection, Value, +}; + +pub fn register(parser: ParserBuilder) -> ParserBuilder { + parser.tag(AmxTag).filter(LeftShift).filter(Setting).filter(Unsigned) +} + +pub fn globals() -> Vec<(KString, Value)> { + vec![ + ("AMX_SET".to_string().into(), Value::scalar(amx_nop_op_imm5(17, 0))), + ("AMX_CLR".to_string().into(), Value::scalar(amx_nop_op_imm5(17, 1))), + ] +} + +fn amx_nop_op_imm5(op: usize, imm5: usize) -> String { + 
format!("nop\nnop\nnop\n.word 0x{:x}\n", (0x201000 + (op << 5) + imm5)) +} + +fn amx_nop_op_gpr(op: usize, gpr: usize) -> String { + format!(".word 0x{:x}", (0x201000 + (op << 5) + gpr)) +} + +#[derive(Copy, Clone)] +struct AmxTag; + +impl ParseTag for AmxTag { + fn reflection(&self) -> &dyn liquid_core::TagReflection { + self + } + + fn parse( + &self, + mut arguments: liquid_core::TagTokenIter, + _options: &liquid_core::Language, + ) -> liquid_core::Result> { + let op = arguments.expect_next("expects op and gpr")?.as_str().to_string(); + let gpr = arguments + .expect_next("expects op and gpr")? + .as_str() + .trim_start_matches('x') + .parse::() + .unwrap(); + let op_id = [ + "ldx", "ldy", "stx", "sty", "ldz", "stz", "ldzi", "stzi", "extrx", "extry", "fma64", + "fms64", "fma32", "fms32", "mac16", "fma16", "fms16", "setclr", "vecint", "vecfp", + "matint", "matfp", "genlut", + ] + .iter() + .position(|x| x == &op) + .unwrap(); + Ok(Box::new(RenderedAmxTag(format!( + "{} \t\t\t\t// AMX {op} x{gpr}\n", + amx_nop_op_gpr(op_id, gpr) + )))) + } +} + +impl TagReflection for AmxTag { + fn tag(&self) -> &str { + "amx" + } + + fn description(&self) -> &str { + "translate to an Apple AMX instruction" + } +} + +#[derive(Clone, Debug)] +struct RenderedAmxTag(String); + +impl Renderable for RenderedAmxTag { + fn render_to( + &self, + writer: &mut dyn std::io::Write, + _runtime: &dyn liquid_core::Runtime, + ) -> liquid_core::Result<()> { + writer.write_all(self.0.as_bytes()).unwrap(); + Ok(()) + } +} + +#[derive(Debug, FilterParameters)] +struct ShiftArgs { + #[parameter(description = "The number to shift the input by.")] + operand: Expression, +} + +#[derive(Clone, ParseFilter, FilterReflection)] +#[filter( + name = "lsl", + description = "Shift left a number by the given operand.", + parameters(ShiftArgs), + parsed(LeftShiftFilter) +)] +struct LeftShift; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "lsl"] +struct LeftShiftFilter { + #[parameters] + args: 
ShiftArgs, +} + +impl Filter for LeftShiftFilter { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> liquid_core::Result { + let args = self.args.evaluate(runtime)?; + + let operand = args + .operand + .as_scalar() + .ok_or_else(|| invalid_argument("operand", "Number expected"))?; + + let result = input + .as_scalar() + .unwrap() + .to_integer() + .and_then(|i| operand.to_integer().map(|o| Value::scalar(i << o))) + .ok_or_else(|| invalid_argument("operand", "Integer expected"))?; + + Ok(result) + } +} + +#[derive(Clone, ParseFilter, FilterReflection)] +#[filter( + name = "setting", + description = "Set the bit deigned by the operand.", + parameters(ShiftArgs), + parsed(SettingFilter) +)] +struct Setting; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "setting"] +struct SettingFilter { + #[parameters] + args: ShiftArgs, +} + +impl Filter for SettingFilter { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> liquid_core::Result { + let args = self.args.evaluate(runtime)?; + + let operand = args + .operand + .as_scalar() + .ok_or_else(|| invalid_argument("operand", "Number expected"))?; + + let result = input + .as_scalar() + .unwrap() + .to_integer() + .and_then(|i| operand.to_integer().map(|o| Value::scalar(i | (1 << o)))) + .ok_or_else(|| invalid_argument("operand", "Integer expected"))?; + + Ok(result) + } +} + +fn invalid_argument(argument: S, cause: S) -> liquid::Error +where + S: Into, +{ + liquid_core::Error::with_msg("Invalid argument") + .context("argument", argument) + .context("cause", cause) +} + +#[derive(Clone, ParseFilter, FilterReflection)] +#[filter(name = "u", description = "unsigned number", parsed(UnsignedFilter))] +pub struct Unsigned; + +#[derive(Debug, Default, Display_filter)] +#[name = "float16"] +struct UnsignedFilter; + +impl Filter for UnsignedFilter { + fn evaluate( + &self, + input: &dyn ValueView, + _runtime: &dyn Runtime, + ) -> liquid_core::Result { + let input = 
input.as_scalar().unwrap().to_integer().unwrap() as u64; + Ok(input.to_string().to_value()) + } +} diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl new file mode 100644 index 000000000..b0d13dc34 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl @@ -0,0 +1,71 @@ +// vim: ft=arm + +// no preservation either for v0-v7 and v16-v31 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_leaky_relu_f16_8n_{{suffix}} +{{G}}arm64fp16_leaky_relu_f16_8n_{{suffix}}: + + cmp x1, #0 + beq .return + + mov v31.h[0], w2 + dup v31.8h, v31.h[0] + mov x2, x0 + + cmp x1, #64 + blt .loop + + ld1 { v16.8h, v17.8h, v18.8h, v19.8h }, [x2], #64 +.loop4: + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64 + + fmul v20.8h, v16.8h, v31.8h + fmul v21.8h, v17.8h, v31.8h + fmul v22.8h, v18.8h, v31.8h + fmul v23.8h, v19.8h, v31.8h + + fcmge v24.8h, v16.8h, #0.0 + fcmge v25.8h, v17.8h, #0.0 + fcmge v26.8h, v18.8h, #0.0 + fcmge v27.8h, v19.8h, #0.0 + + bsl v24.16b, v16.16b, v20.16b + bsl v25.16b, v17.16b, v21.16b + bsl v26.16b, v18.16b, v22.16b + bsl v27.16b, v19.16b, v23.16b + + st1 { v24.8h, v25.8h, v26.8h, v27.8h }, [x0], #64 + + and v16.16b, v0.16b, v0.16b + and v17.16b, v1.16b, v1.16b + and v18.16b, v2.16b, v2.16b + and v19.16b, v3.16b, v3.16b + + subs x1, x1, #32 + cmp x1, #64 + bge .loop4 + + cmp x1, #0 + beq .return + +.loop: + ld1 { v16.8h }, [x0] + + fmul v17.8h, v16.8h, v31.8h + fcmge v18.8h, v16.8h, #0.0 + bsl v18.16b, v16.16b, v17.16b + + st1 { v18.8h }, [x0], #16 + + subs x1, x1, #8 + bne .loop + +.return: + ret diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq new file mode 100644 index 000000000..6bddb35f9 --- /dev/null +++ 
b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq @@ -0,0 +1,41 @@ +// vim: ft=arm + +.{{label}}: + ldr x2, [x0, #8] // x2 = address of the per-column values, read from [x0 + 8] + +{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%} +{% capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_8}}{%endcapture%} + +{% capture loads %}{{cols | divided_by: 8}}{% endcapture %} + +{%if cols == "1" %} + ld1 {v0.h}[0], [ x2 ] +{% elsif cols == "3" %} + ld1 {v0.s}[0], [ x2 ], #4 + ld1 {v0.h}[2], [ x2 ] +{% elsif cols == "4" %} + ldr d0, [ x2 ] +{% elsif cols == "6" %} + ld1 {v0.d}[0], [ x2 ], #8 + ld1 {v0.s}[2], [ x2 ] +{% else %} + {% for reg in (1..loads) %} + ldr q{{reg |minus:1}}, [ x2 ], #16 + {% endfor %} +{% endif %} + +// mr:{{mr}} {{ loads }} {{cols}} -- NOTE(review): for cols of 2, 5 or 7 loads evaluates to 0 and nothing is loaded above; presumably those widths are never instantiated, confirm + +{% for col in (1..cols) %} + dup v3.8h, v{{col| minus: 1|divided_by:8}}.h[{{col| minus: 1|modulo:8}}] + {% for row in (1..mr_over_8) %} + {% capture acc %}{{ col|minus:1|times:mr_over_8|plus:row|minus:1|plus:from }}{% endcapture %} + {% if flipped %} + {{op}} v{{acc}}.8h, v{{acc}}.8h, v3.8h + {% else %} + {{op}} v{{acc}}.8h, v3.8h, v{{acc}}.8h + {% endif %} + {% endfor %} +{% endfor %} + +b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq new file mode 100644 index 000000000..f756344a7 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq @@ -0,0 +1,25 @@ +// vim: ft=arm + +.{{label}}: + ldr x2, [x0, #8] // x2 = address of the per-row values, read from [x0 + 8] + +{% capture mr_over_8 %}{{ mr | divided_by: 8 }}{%endcapture%} +{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1 }}{%endcapture%} + +{% for reg in (0..mr_over_8_min_1) %} + ldr q{{reg}}, [ x2 ], #16 +{% endfor %} + +{% if flipped %} + {% for acc in (from..to) %} + {% capture other%}{{acc | minus: from | modulo: mr_over_8}}{%endcapture%} + {{op}} v{{acc}}.8h, v{{acc}}.8h, v{{other}}.8h + {% endfor %} +{% else %} + {% for acc in (from..to) %} + {% 
capture other%}{{acc | minus: from | modulo: mr_over_8}}{%endcapture%} + {{op}} v{{acc}}.8h, v{{other}}.8h, v{{acc}}.8h + {% endfor %} +{% endif %} + +b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq new file mode 100644 index 000000000..1916c2698 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq @@ -0,0 +1,18 @@ +// vim: ft=arm + +.{{label}}: + add x2, x0, #8 + ld1 {v0.h}[0], [ x2 ] + dup v0.8h, v0.h[0] // broadcast the f16 stored at [x0 + 8] to all 8 lanes + {% if flipped %} + {% for reg in (from..to) %} + {{op}} v{{reg}}.8h, v{{reg}}.8h, v0.8h + {% endfor %} + {% else %} + {% for reg in (from..to) %} + {{op}} v{{reg}}.8h, v0.8h, v{{reg}}.8h + {% endfor %} + {% endif %} + + b .non_linear_loop + + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli new file mode 100644 index 000000000..410816dff --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli @@ -0,0 +1,65 @@ + fmla v16.4s, v0.4s, v8.s[0] // NOTE(review): this fragment works on f32 lanes (.4s / v8.s[0]) and is not among the includes of arm64fp16_mmm_f16_128x1_core.tmpl; looks like a leftover from an f32 kernel, confirm + ldr x5, [x1, #128] + fmla v17.4s, v1.4s, v8.s[0] + ldr x6, [x1, #136] + fmla v18.4s, v2.4s, v8.s[0] + ldr x7, [x1, #144] + fmla v19.4s, v3.4s, v8.s[0] + ldr x9, [x1, #152] + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64 + + fmla v20.4s, v4.4s, v8.s[0] + ldr x10, [x1, #96] + fmla v21.4s, v5.4s, v8.s[0] + ldr x11, [x1, #104] + fmla v22.4s, v6.4s, v8.s[0] + ldr x12, [x1, #112] + fmla v23.4s, v7.4s, v8.s[0] + ldr x13, [x1, #120] + + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [ x1 ] + + fmla v24.4s, v0.4s, v8.s[0] + ldr x14, [x1, #128] + fmla v25.4s, v1.4s, v8.s[0] + ldr x15, [x1, #136] + fmla v26.4s, v2.4s, v8.s[0] + ldr x20, [x1, #144] + fmla v27.4s, v3.4s, v8.s[0] + ldr x21, [x1, #152] + fmla v28.4s, v4.4s, v8.s[0] + ldr x22, [x1, #160] + fmla v29.4s, v5.4s, 
v8.s[0] + ldr x23, [x1, #168] + fmla v30.4s, v6.4s, v8.s[0] + ldr x24, [x1, #176] + fmla v31.4s, v7.4s, v8.s[0] + ldr x25, [x1, #184] + + ld1 {{ v8.s }}[0], [ x2 ], #4 + + prfm pldl1keep, [x1, #1024] + prfm pldl1keep, [x1, #1088] + prfm pldl1keep, [x1, #1152] + prfm pldl1keep, [x1, #1216] + prfm pldl1keep, [x2, #256] + + ins v0.d[0], x5 + ins v1.d[0], x7 + ins v2.d[0], x10 + ins v3.d[0], x12 + ins v4.d[0], x14 + ins v5.d[0], x20 + ins v6.d[0], x22 + ins v7.d[0], x24 + + ins v0.d[1], x6 + ins v1.d[1], x9 + ins v2.d[1], x11 + ins v3.d[1], x13 + ins v4.d[1], x15 + ins v5.d[1], x21 + ins v6.d[1], x23 + ins v7.d[1], x25 + + add x1, x1, #192 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli new file mode 100644 index 000000000..367339ef5 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli @@ -0,0 +1,32 @@ + ld1 {{ v9.8h, v10.8h, v11.8h, v12.8h }}, [x1], #64 + ld1 {{ v13.8h, v14.8h, v15.8h }}, [x1], #48 + + fmla v16.8h, v0.8h, v8.h[0] + fmla v17.8h, v1.8h, v8.h[0] + fmla v18.8h, v2.8h, v8.h[0] + fmla v19.8h, v3.8h, v8.h[0] + fmla v20.8h, v4.8h, v8.h[0] + fmla v21.8h, v5.8h, v8.h[0] + fmla v22.8h, v6.8h, v8.h[0] + fmla v23.8h, v7.8h, v8.h[0] + fmla v24.8h, v9.8h, v8.h[0] + ld1 {{ v9.8h }}, [ x1 ], #16 + ld1 {{ v0.8h, v1.8h, v2.8h, v3.8h }}, [x1], #64 + ld1 {{ v4.8h, v5.8h, v6.8h, v7.8h }}, [x1], #64 + fmla v25.8h, v10.8h, v8.h[0] + fmla v26.8h, v11.8h, v8.h[0] + fmla v27.8h, v12.8h, v8.h[0] + fmla v28.8h, v13.8h, v8.h[0] + fmla v29.8h, v14.8h, v8.h[0] + fmla v30.8h, v15.8h, v8.h[0] + + fmla v31.8h, v9.8h, v8.h[0] + + ld1 {{ v8.h }}[0], [ x2 ], #2 // one f16 from the x2 stream; it is the v8.h[0] multiplier of the next pass + + prfm pldl1keep, [x1, #1024] + prfm pldl1keep, [x1, #1088] + prfm pldl1keep, [x1, #1152] + prfm pldl1keep, [x1, #1216] + prfm pldl1keep, [x2, #256] + + diff --git 
a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli new file mode 100644 index 000000000..821ed3f5c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli @@ -0,0 +1,85 @@ + ld1 {{ v9.4s, v10.4s, v11.4s }}, [x1], #48 + + fmla v16.8h, v0.8h, v8.h[0] + ldr w8, [x2], #4 // two f16 from the x2 stream; the ins below places them in v8.h[2] and v8.h[3] + fmla v17.8h, v1.8h, v8.h[0] + ldr d12, [x1], #8 + fmla v18.8h, v2.8h, v8.h[0] + ldr x12, [x1], #8 + fmla v19.8h, v3.8h, v8.h[0] + ldr d13, [x1], #8 + fmla v20.8h, v4.8h, v8.h[0] + ldr x13, [x1], #8 + fmla v21.8h, v5.8h, v8.h[0] + ldr d14, [x1], #8 + fmla v22.8h, v6.8h, v8.h[0] + ldr x14, [x1], #8 + fmla v23.8h, v7.8h, v8.h[0] + ldr d15, [x1], #8 + fmla v24.8h, v9.8h, v8.h[0] + ldr x15, [x1], #8 + + ld1 {{ v0.8h, v1.8h, v2.8h, v3.8h }}, [x1], #64 + ins v8.s[1], w8 + ld1 {{ v4.8h, v5.8h, v6.8h, v7.8h }}, [x1], #64 + + fmla v25.8h, v10.8h, v8.h[0] + ins v12.d[1], x12 + fmla v26.8h, v11.8h, v8.h[0] + ins v13.d[1], x13 + fmla v27.8h, v12.8h, v8.h[0] + ins v14.d[1], x14 + fmla v28.8h, v13.8h, v8.h[0] + ins v15.d[1], x15 + + ld1 {{ v9.8h, v10.8h, v11.8h, v12.8h }}, [x1], #64 + + fmla v29.8h, v14.8h, v8.h[0] + ldr d13, [x1], #8 + fmla v30.8h, v15.8h, v8.h[0] + ldr x13, [x1], #8 + fmla v31.8h, v0.8h, v8.h[0] + ldr d14, [x1], #8 + + fmla v16.8h, v1.8h, v8.h[2] + ldr x14, [x1], #8 + fmla v17.8h, v2.8h, v8.h[2] + ldr d15, [x1], #8 + fmla v18.8h, v3.8h, v8.h[2] + ldr x15, [x1], #8 + fmla v19.8h, v4.8h, v8.h[2] + + ld1 {{ v0.8h }}, [x1], #16 + + fmla v20.8h, v5.8h, v8.h[2] + ldr d1, [x1], #8 + fmla v21.8h, v6.8h, v8.h[2] + ldr x10, [x1], #8 + + fmla v22.8h, v7.8h, v8.h[2] + + fmla v23.8h, v9.8h, v8.h[2] + ins v13.d[1], x13 + fmla v24.8h, v10.8h, v8.h[2] + ins v14.d[1], x14 + fmla v25.8h, v11.8h, v8.h[2] + ins v15.d[1], x15 + + fmla v26.8h, v12.8h, v8.h[2] + prfm pldl1keep, [x1, #1024] + fmla v27.8h, v13.8h, v8.h[2] + ins v1.d[1], 
x10 + fmla v28.8h, v14.8h, v8.h[2] + prfm pldl1keep, [x1, #1088] + fmla v29.8h, v15.8h, v8.h[2] + prfm pldl1keep, [x1, #1152] + fmla v30.8h, v0.8h, v8.h[2] + prfm pldl1keep, [x1, #1216] + fmla v31.8h, v1.8h, v8.h[2] + prfm pldl1keep, [x2, #256] + + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64 + ins v8.h[0], v8.h[3] + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64 + + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl new file mode 100644 index 000000000..4ba821d4d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl @@ -0,0 +1,203 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, no need to preserve + +// no preservation either for v0-v7... +// v8..v15 are callee-preserved +// packed A: 128 f16 per k step, streamed through v0-v7 and v9-v15 +// packed B: one f16 per k step, kept in a lane of v8 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}} +{{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + sub x3, x3, #1 // the final k step is handled by .packed_packed_loop_1_last + + + ld1 { v8.h }[0], [ x2 ], #2 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64 + + cmp x3, #0 + beq .packed_packed_loop_1_last + + cmp x3, #4 + blt .packed_packed_loop_1 + +{% capture packed_packed_loop1 %} + {% include "arm64fp16_mmm_f16_128x1/loop1/naive.tmpli" %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% include "arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli" %} +{% endcapture %} + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + cmp x3, #0 + beq .packed_packed_loop_1_last + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + +// last loop can't read beyond actual input as it's likely not packed and padded +.packed_packed_loop_1_last: + ld1 { v9.8h, v10.8h, v11.8h, v12.8h }, [x1], #64 + ld1 { v13.8h, v14.8h, v15.8h }, [x1], #48 + + fmla v16.8h, v0.8h, v8.h[0] + fmla v17.8h, v1.8h, v8.h[0] + ld1 { v0.8h }, [ x1 ] + fmla v18.8h, v2.8h, v8.h[0] + fmla v19.8h, v3.8h, v8.h[0] + fmla v20.8h, v4.8h, v8.h[0] + fmla v21.8h, v5.8h, v8.h[0] + fmla v22.8h, v6.8h, v8.h[0] + fmla v23.8h, v7.8h, v8.h[0] + + fmla v24.8h, v9.8h, v8.h[0] + fmla v25.8h, v10.8h, v8.h[0] + fmla v26.8h, v11.8h, v8.h[0] + fmla v27.8h, v12.8h, v8.h[0] + fmla v28.8h, v13.8h, v8.h[0] + fmla v29.8h, v14.8h, v8.h[0] + fmla v30.8h, v15.8h, v8.h[0] + fmla v31.8h, v0.8h, v8.h[0] + + b .non_linear_loop + +{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%} +{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:128, from:16, to:31%} +{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:128, from:16, to:31%} +{% include "arm64fp16_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, 
#8] // c base ptr, rsc + cmp x6, #2 + beq .do_per_row_add // rsc == 2: the f16 values are contiguous, add with full-vector loads + + {% for reg in (16..31) %} + {% for lane in (0..7) %} + ld1 {v0.h}[{{lane}}], [ x5 ], x6 + {% endfor %} + fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h + {% endfor %} + + b .non_linear_loop + +.do_per_row_add: + ld1 {v0.8h-v3.8h}, [x5], #64 + ld1 {v4.8h-v7.8h}, [x5], #64 + ld1 {v8.8h-v11.8h}, [x5], #64 + ld1 {v12.8h-v15.8h}, [x5], #64 + + {% for r in (0..15) %} + fadd v{{r| plus: 16}}.8h, v{{r | plus: 16}}.8h, v{{r}}.8h + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x3, [x0, #16] + ldr x2, [x0, #8] + + ld1 {v8.h}[0], [ x3 ] + + {% for r in (0..7) %} + ldr q{{r}}, [x2], #16 + {% endfor %} + + fmla v16.8h, v0.8h, v8.h[0] + ldr q0, [x2], #16 + fmla v17.8h, v1.8h, v8.h[0] + ldr q1, [x2], #16 + fmla v18.8h, v2.8h, v8.h[0] + ldr q2, [x2], #16 + fmla v19.8h, v3.8h, v8.h[0] + ldr q3, [x2], #16 + fmla v20.8h, v4.8h, v8.h[0] + ldr q4, [x2], #16 + fmla v21.8h, v5.8h, v8.h[0] + ldr q5, [x2], #16 + fmla v22.8h, v6.8h, v8.h[0] + ldr q6, [x2], #16 + fmla v23.8h, v7.8h, v8.h[0] + ldr q7, [x2], #16 + + fmla v24.8h, v0.8h, v8.h[0] + fmla v25.8h, v1.8h, v8.h[0] + fmla v26.8h, v2.8h, v8.h[0] + fmla v27.8h, v3.8h, v8.h[0] + fmla v28.8h, v4.8h, v8.h[0] + fmla v29.8h, v5.8h, v8.h[0] + fmla v30.8h, v6.8h, v8.h[0] + fmla v31.8h, v7.8h, v8.h[0] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + + cmp x6, #2 + beq .store_strides_contig + + {% for reg in (16..31) %} + {% for lane in (0..7) %} + st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for reg in (16..31) %} + st1 { v{{reg}}.8h }, [ x5 ], #16 + {% endfor %} + b .non_linear_loop + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + + diff --git 
a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli new file mode 100644 index 000000000..a55fe12e8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli @@ -0,0 +1,21 @@ + +fmla v16.8h, v0.8h, v4.h[0] +fmla v17.8h, v1.8h, v4.h[0] +fmla v18.8h, v0.8h, v4.h[1] +fmla v19.8h, v1.8h, v4.h[1] +fmla v20.8h, v0.8h, v4.h[2] +fmla v21.8h, v1.8h, v4.h[2] +fmla v22.8h, v0.8h, v4.h[3] +fmla v23.8h, v1.8h, v4.h[3] + +fmla v24.8h, v0.8h, v4.h[4] +fmla v25.8h, v1.8h, v4.h[4] +fmla v26.8h, v0.8h, v4.h[5] +fmla v27.8h, v1.8h, v4.h[5] +fmla v28.8h, v0.8h, v4.h[6] +fmla v29.8h, v1.8h, v4.h[6] +fmla v30.8h, v0.8h, v4.h[7] +fmla v31.8h, v1.8h, v4.h[7] + +ld1 {{ v0.8h, v1.8h }}, [x1], #32 +ld1 {{ v4.8h }}, [x2], #16 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli new file mode 100644 index 000000000..3fef68ae7 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli @@ -0,0 +1,54 @@ +fmla v16.8h, v0.8h, v4.h[0] +ldr d2, [x1], #8 +fmla v17.8h, v1.8h, v4.h[0] +ldr d6, [x2], #8 +fmla v18.8h, v0.8h, v4.h[1] +ldr x5, [x1], #8 +fmla v19.8h, v1.8h, v4.h[1] +ldr x7, [x2], #8 +fmla v20.8h, v0.8h, v4.h[2] +ldr d3, [x1], #8 +fmla v21.8h, v1.8h, v4.h[2] +fmla v22.8h, v0.8h, v4.h[3] +ldr x6, [x1], #8 +fmla v23.8h, v1.8h, v4.h[3] + +fmla v24.8h, v0.8h, v4.h[4] +fmla v25.8h, v1.8h, v4.h[4] +fmla v26.8h, v0.8h, v4.h[5] +fmla v27.8h, v1.8h, v4.h[5] +fmla v28.8h, v0.8h, v4.h[6] +ins v2.d[1], x5 +fmla v29.8h, v1.8h, v4.h[6] +ins v6.d[1], x7 +fmla v30.8h, v0.8h, v4.h[7] +ins v3.d[1], x6 +fmla v31.8h, v1.8h, v4.h[7] + +fmla v16.8h, v2.8h, v6.h[0] +ldr d0, [x1], #8 +fmla v17.8h, v3.8h, v6.h[0] +ldr d4, [x2], #8 +fmla v18.8h, v2.8h, v6.h[1] +ldr x5, 
[x1], #8 +fmla v19.8h, v3.8h, v6.h[1] +ldr x7, [x2], #8 +fmla v20.8h, v2.8h, v6.h[2] +ldr d1, [x1], #8 +fmla v21.8h, v3.8h, v6.h[2] +fmla v22.8h, v2.8h, v6.h[3] +ldr x6, [x1], #8 +fmla v23.8h, v3.8h, v6.h[3] + +fmla v24.8h, v2.8h, v6.h[4] +fmla v25.8h, v3.8h, v6.h[4] +fmla v26.8h, v2.8h, v6.h[5] +fmla v27.8h, v3.8h, v6.h[5] +fmla v28.8h, v2.8h, v6.h[6] +ins v0.d[1], x5 +fmla v29.8h, v3.8h, v6.h[6] +ins v4.d[1], x7 +fmla v30.8h, v2.8h, v6.h[7] +ins v1.d[1], x6 +fmla v31.8h, v3.8h, v6.h[7] + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl new file mode 100644 index 000000000..a523751dc --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl @@ -0,0 +1,174 @@ +// vim: ft=arm + +// x20..x27 are used, callee-preserved + +// C tile regs: v16 to v31, (scratch) +// +// v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0] +// v16[1] v18[1] +// v16[2] v18[2] +// v16[3] v18[3] +// +// v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0] +// v17[1] v19[1] +// v17[2] v19[2] +// v17[3] v19[3] + +// v8 is used, d8 (lower half) must preserved +// v0-v7 (scratch registers) +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}} +{{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + str q8, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + +.packed_packed: + ld1 { v0.4s, v1.4s }, [ x1 ], #32 + ld1 { v4.4s }, [ x2 ], #16 + +{% capture packed_packed_loop1 %} + {% include "arm64fp16_mmm_f16_16x8/loop1/naive.tmpli" %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% if core == "a55" %} + {% include "arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli" %} + {% else %} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {% endif %} +{% endcapture %} + + cmp x3, #4 + blt .packed_packed_loop_1 + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + + cmp x3, #0 + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%} +{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:16, from:16, to:31 %} +{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:16, from:16, to:31 %} +{% include "arm64fp16_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..7) %} + ld1 {v0.h}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:2 | plus: reg}}.8h, v{{col | times:2 | plus: reg}}.8h, v0.8h + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.4s, v1.4s }, [ x2 ], #32 + ld1 { v4.4s }, [ x3 ], #16 + + fmla v16.8h, v0.8h, v4.h[0] + fmla v17.8h, v1.8h, v4.h[0] + fmla v18.8h, v0.8h, v4.h[1] + fmla v19.8h, v1.8h, v4.h[1] + fmla v20.8h, v0.8h, v4.h[2] + fmla v21.8h, v1.8h, v4.h[2] + fmla v22.8h, v0.8h, v4.h[3] + fmla v23.8h, v1.8h, v4.h[3] + + fmla v24.8h, v0.8h, v4.h[4] + fmla 
v25.8h, v1.8h, v4.h[4] + fmla v26.8h, v0.8h, v4.h[5] + fmla v27.8h, v1.8h, v4.h[5] + fmla v28.8h, v0.8h, v4.h[6] + fmla v29.8h, v1.8h, v4.h[6] + fmla v30.8h, v0.8h, v4.h[7] + fmla v31.8h, v1.8h, v4.h[7] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #2 + bne .store_strides_generic + + {% for col in (8..15) %} + str q{{col | times:2 }}, [ x5 ] + str q{{col | times:2 | plus: 1}}, [ x5, #16 ] + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_generic: + + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..7) %} + st1 { v{{col | times:2 | plus: reg}}.h }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldr q8, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli new file mode 100644 index 000000000..fa0b84887 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli @@ -0,0 +1,21 @@ + +fmla v16.8h, v0.8h, v4.h[0] +fmla v17.8h, v1.8h, v4.h[0] +fmla v18.8h, v2.8h, v4.h[0] +fmla v19.8h, v3.8h, v4.h[0] +fmla v20.8h, v0.8h, v4.h[1] +fmla v21.8h, v1.8h, v4.h[1] +fmla v22.8h, v2.8h, v4.h[1] +fmla v23.8h, v3.8h, v4.h[1] + +fmla v24.8h, v0.8h, v4.h[2] +fmla v25.8h, v1.8h, v4.h[2] +fmla v26.8h, v2.8h, v4.h[2] +fmla v27.8h, v3.8h, v4.h[2] +fmla v28.8h, v0.8h, v4.h[3] +fmla v29.8h, v1.8h, v4.h[3] +fmla v30.8h, v2.8h, v4.h[3] +fmla v31.8h, v3.8h, v4.h[3] + +ld1 {{ v0.8h, v1.8h, v2.8h, v3.8h }}, [ x1 ], #64 +ldr d4, [x2], #8 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli 
b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli new file mode 100644 index 000000000..2e64319d0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli @@ -0,0 +1,71 @@ +// mul a: v0, v1, v2, v3 b: v4 (first of the two k-steps handled by this include) +// load a: v5(d5/x5), v6(d6,x6), v7(d7,x7), v8(d8, x8): low half via ldr dN, high half via ldr xN followed by ins +// load b: v9 as d9 (4 f16; only v9.h[0..3] are read by the second k-step) + +fmla v16.8h, v0.8h, v4.h[0] +ldr d5, [x1], #8 +fmla v17.8h, v1.8h, v4.h[0] +ldr d9, [x2], #8 +fmla v18.8h, v2.8h, v4.h[0] +ldr x5, [x1], #8 +fmla v19.8h, v3.8h, v4.h[0] +fmla v20.8h, v0.8h, v4.h[1] +ldr d6, [x1], #8 +fmla v21.8h, v1.8h, v4.h[1] +ldr x6, [x1], #8 +fmla v22.8h, v2.8h, v4.h[1] +ldr d7, [x1], #8 +fmla v23.8h, v3.8h, v4.h[1] +ldr x7, [x1], #8 + +fmla v24.8h, v0.8h, v4.h[2] +ldr d8, [x1], #8 +fmla v25.8h, v1.8h, v4.h[2] +ldr x8, [x1], #8 +fmla v26.8h, v2.8h, v4.h[2] +ins v5.d[1], x5 +fmla v27.8h, v3.8h, v4.h[2] +ins v6.d[1], x6 +fmla v28.8h, v0.8h, v4.h[3] +ins v7.d[1], x7 +fmla v29.8h, v1.8h, v4.h[3] +ins v8.d[1], x8 +fmla v30.8h, v2.8h, v4.h[3] +ins v9.d[1], x9 // NOTE(review): x9 is not loaded anywhere in this include and v9.d[1] is never read (fmla uses v9.h[0..3] only); looks like a leftover, confirm before removing +fmla v31.8h, v3.8h, v4.h[3] + +// mul a: v5, v6, v7, v8 b: v9 (second k-step) +// load a: v0(d0/x5), v1(d1,x6), v2(d2,x7), v3(d3, x8) +// load b: v4 as d4 (operands for the next expansion of this include) + +fmla v16.8h, v5.8h, v9.h[0] +ldr d0, [x1], #8 +fmla v17.8h, v6.8h, v9.h[0] +ldr d4, [x2], #8 +fmla v18.8h, v7.8h, v9.h[0] +ldr x5, [x1], #8 +fmla v19.8h, v8.8h, v9.h[0] +fmla v20.8h, v5.8h, v9.h[1] +ldr d1, [x1], #8 +fmla v21.8h, v6.8h, v9.h[1] +ldr x6, [x1], #8 +fmla v22.8h, v7.8h, v9.h[1] +ldr d2, [x1], #8 +fmla v23.8h, v8.8h, v9.h[1] +ldr x7, [x1], #8 + +fmla v24.8h, v5.8h, v9.h[2] +ldr d3, [x1], #8 +fmla v25.8h, v6.8h, v9.h[2] +ldr x8, [x1], #8 +fmla v26.8h, v7.8h, v9.h[2] +ins v0.d[1], x5 +fmla v27.8h, v8.8h, v9.h[2] +ins v1.d[1], x6 +fmla v28.8h, v5.8h, v9.h[3] +ins v2.d[1], x7 +fmla v29.8h, v6.8h, v9.h[3] +ins v3.d[1], x8 +fmla v30.8h, v7.8h, v9.h[3] +ins v4.d[1], x9 // NOTE(review): same as above, x9 is never loaded here and v4.d[1] is never read (only v4.h[0..3]); confirm +fmla v31.8h, v8.8h, v9.h[3] diff --git
a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl new file mode 100644 index 000000000..11cb30b87 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl @@ -0,0 +1,165 @@ +// vim: ft=arm + +// x20..x27 are used, callee-preserved + +// C tile regs: v16 to v31, (scratch): 4 columns of 4 registers, 8 f16 rows per register + +// v8 and v9 are used (by the a55 loop2 include); d8/d9 (lower halves) must be preserved +// v0-v7 (scratch registers) +// packed A buffering (4x8 values per k-step): v0..v3, alternating with v5..v8 in the a55 loop2 include +// packed B buffering (4 values per k-step): v4, alternating with v9 in the a55 loop2 include + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_mmm_f16_32x4_{{core}}_{{suffix}} +{{G}}arm64fp16_mmm_f16_32x4_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! // restored in reverse order at .return + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 // k == 0: nothing to accumulate + beq .non_linear_loop + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 // preload A for k-step 0 (32 f16) + ldr d4, [x2], #8 // preload B for k-step 0 (4 f16); NOTE(review): every loop body multiplies and then loads the next k-step, so the final iteration loads one k-step beyond k, presumably the packed buffers are padded, confirm + +{% capture packed_packed_loop1 %} + {% include "arm64fp16_mmm_f16_32x4/loop1/naive.tmpli" %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% if core == "a55" %} + {% include "arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli" %} + {% else %} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {% endif %} +{% endcapture %} + + cmp x3, #4 // fewer than 4 k-steps: use the single-step loop only + blt .packed_packed_loop_1 + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 // two loop2 expansions at 2 k-steps each + cmp x3, #4 + bge .packed_packed_loop_4 + + cmp x3, #0 + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%} +{% include
"arm64fp16_mmm_f16_per_rows.tmpliq" mr:32, from:16, to:31 %} +{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:32, from:16, to:31 %} +{% include "arm64fp16_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + {% for col in (0..3) %} + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..7) %} + ld1 {v0.h}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:4 | plus: 16| plus: reg}}.8h, v{{col | times:4 | plus: 16 | plus: reg}}.8h, v0.8h + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ] + ldr d4, [x3] + + fmla v16.8h, v0.8h, v4.h[0] + fmla v17.8h, v1.8h, v4.h[0] + fmla v18.8h, v2.8h, v4.h[0] + fmla v19.8h, v3.8h, v4.h[0] + fmla v20.8h, v0.8h, v4.h[1] + fmla v21.8h, v1.8h, v4.h[1] + fmla v22.8h, v2.8h, v4.h[1] + fmla v23.8h, v3.8h, v4.h[1] + + fmla v24.8h, v0.8h, v4.h[2] + fmla v25.8h, v1.8h, v4.h[2] + fmla v26.8h, v2.8h, v4.h[2] + fmla v27.8h, v3.8h, v4.h[2] + fmla v28.8h, v0.8h, v4.h[3] + fmla v29.8h, v1.8h, v4.h[3] + fmla v30.8h, v2.8h, v4.h[3] + fmla v31.8h, v3.8h, v4.h[3] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #2 + bne .store_strides_generic + + {% for col in (0..3) %} + str q{{col | times:4 | plus:16 | plus: 0}}, [ x5 ] + str q{{col | times:4 | plus:16 | plus: 1}}, [ x5, #16 ] + str q{{col | times:4 | plus:16 | plus: 2}}, [ x5, #32 ] + str q{{col | times:4 | plus:16 | plus: 3}}, [ x5, #48 ] + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_generic: + + {% for col in (0..3) %} + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..7) %} + st1 { v{{col | times:4 | plus: 16 | plus: reg}}.h }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldp d8, d9, [sp], #16 + + ldp x26, x27, 
[sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl new file mode 100644 index 000000000..b66d1d399 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl @@ -0,0 +1,148 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, no need to preserve + +// no preservation either for v0-v7... +// v8..v15 are callee-preserved +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_mmm_f16_32x6_{{core}}_{{suffix}} +{{G}}arm64fp16_mmm_f16_32x6_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing (x4 is loaded but not consulted by this kernel) + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 // k == 0: nothing to accumulate + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + ld1 { v7.8h }, [ x2 ] // loads 8 f16 (16 bytes) although only v7.h[0..5] are used; NOTE(review): on the last k-step this reads 4 bytes past the 12-byte B row unless B is padded, confirm + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64 + add x2, x2, 12 // 6 f16 per k-step; NOTE(review): the block-commented fmla group in the loop below refers to v6, which this loop never loads, it looks like a disabled leftover + +{% for row in (0..3) %} + {% for col in (0..5) %} + fmla v{{ col|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}] + {% endfor %} + /* + {% for col in (0..1) %} + fmla v{{ col|plus:4|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v6.h[{{col}}] + {% endfor %} + */ +{% endfor %} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:8, to:31%} +{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:32, from:8, to:31%} +{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:32, from:8, to:31%} +{% include "arm64fp16_mmm_load_tile.tmpliq" from:8, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] // source base pointer and row byte stride (x6 steps x4 per lane below) + ldp x7, x8, [x0, #24] // x7 is the column byte stride (added to x5 per column); x8 is not used here + + {% for col in (0..5) %} + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..7) %} + ld1 {v0.h}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:4 | plus: 8| plus: reg}}.8h, v{{col | times:4 | plus: 8 | plus: reg}}.8h, v0.8h + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.do_per_row_add: // NOTE(review): no branch to this label is visible in this template and it only updates v24..v31 of the v8..v31 tile; it matches the 64x1 kernel's block verbatim, presumably dead copy-paste, confirm + ld1 {v0.8h-v3.8h}, [x5], #64 + ld1 {v4.8h-v7.8h}, [x5], #64 + + {% for r in (0..7) %} + fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldp x2, x3, [x0, #8] // x2: 32 row values, x3: 6 column values + + ld1 { v7.d }[0], [ x3 ], #8 // column values 0..3 + ld1 { v7.s }[2], [ x3 ], #4 // column values 4..5: exactly 12 bytes are read, unlike the 16-byte load in the matmul loop + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ], #64 + +{% for row in (0..3) %} + {% for col in (0..5) %} + fmla v{{ col|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}] + {% endfor %} +{% endfor %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #2 // rsc == 2 bytes: rows are contiguous f16, whole registers can be stored + beq .store_strides_contig + + {% for col in
(0..5) %} + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..7) %} + st1 { v{{col | times:4 | plus: 8 | plus: reg}}.h }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for col in (0..5) %} + mov x4, x5 + {% for r in (0..3) %} + st1 { v{{col | times:4 | plus: 8 | plus: r}}.8h }, [ x4 ], 16 + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl new file mode 100644 index 000000000..b12e9237d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl @@ -0,0 +1,264 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, no need to preserve + +// no preservation either for v0-v7... +// v8..v15 are callee-preserved +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_mmm_f16_64x1_{{core}}_{{suffix}} +{{G}}arm64fp16_mmm_f16_64x1_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing (1 and 2 select the q4 paths below, any other value uses the plain f16 loop) + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 // k == 0: nothing to accumulate + beq .non_linear_loop + + cmp x4, #1 // packing 1: 4-bit A with the scales stored after each 32-step block + beq .q4f16se + + cmp x4, #2 // packing 2: 4-bit A with the scales stored before each 32-step block + beq .q4f16 + + + +.p2align 4 +.packed_packed_loop_1: + ld1 { v8.h }[0], [ x2 ], #2 // one f16 of B per k-step + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64 + ld1 { v4.8h, v5.8h, v6.8h, v7.8h }, [ x1 ], #64 // 64 f16 of A per k-step + + fmla v24.8h, v0.8h, v8.h[0] + fmla v25.8h, v1.8h, v8.h[0] + fmla v26.8h, v2.8h, v8.h[0] + fmla v27.8h, v3.8h, v8.h[0] + fmla v28.8h, v4.8h, v8.h[0] + fmla v29.8h, v5.8h, v8.h[0] + fmla v30.8h, v6.8h, v8.h[0] + fmla v31.8h, v7.8h, v8.h[0] + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.p2align 8 +.q40f16_const: // lookup table: entry n (n = 0..15) is the high byte of the f16 encoding of n - 8, i.e. -8.0 .. 7.0; the low byte of each of those encodings is zero + .byte 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc + .byte 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, 0x46, 0x47 + +.q4f16se: + adr x4, .q40f16_const + movi v15.16b, 15 // low-nibble mask + ld1 {v13.16b}, [ x4 ] // v13 = lookup table + eor v12.16b, v12.16b, v12.16b // v12 = 0, supplies the low byte of every f16 built by the zips below + +.q4f16se_outerloop: +{% for i in (0..7) %} + eor v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b +{% endfor %} + mov x4, #32 // 32 k-steps per block; v16..v23 hold the unscaled block sum + +.p2align 4 +.q4f16se_innerloop: + ld1 { v9.16b-v10.16b }, [x1], #32 // 64 packed 4-bit weights + ld1 { v8.h }[0], [ x2 ], #2 + + and v0.16b, v9.16b, v15.16b // low nibbles + ushr v2.16b, v9.16b, 4 // high nibbles + + and v4.16b, v10.16b, v15.16b + ushr v6.16b, v10.16b, 4 + + tbl v0.16b, { v13.16b }, v0.16b // nibble to f16 high byte + tbl v2.16b, { v13.16b }, v2.16b + tbl v4.16b, { v13.16b }, v4.16b + tbl v6.16b, { v13.16b }, v6.16b + + zip2 v1.16b, v12.16b, v0.16b // interleave zero low bytes with the looked-up high bytes: upper 8 entries become 8 f16 + zip2 v3.16b, v12.16b, v2.16b + zip2 v5.16b, v12.16b, v4.16b + zip2 v7.16b, v12.16b, v6.16b + + zip1 v0.16b, v12.16b, v0.16b // same for the lower 8 entries (done last because it overwrites the tbl result) + zip1 v2.16b, v12.16b, v2.16b + zip1 v4.16b, v12.16b, v4.16b + zip1 v6.16b, v12.16b, v6.16b + +{% for i in (0..7) %} + fmla v{{ i|plus: 16 }}.8h, v{{i}}.8h, v8.h[0] +{% endfor %} + + subs x4, x4, #1 + bne .q4f16se_innerloop + + // scales: 64 f16 follow each block of packed weights; the tile v24..v31 accumulates scale times block sum (v16..v23) + ld1 { v0.8h-v3.8h }, [ x1 ], #64 + ld1 { v4.8h-v7.8h }, [ x1 ], #64 + +{% for i in (0..7) %} + fmla v{{i|plus:24}}.8h, v{{i}}.8h, v{{i|plus:16}}.8h +{%
endfor %} + + subs x3, x3, #32 + bne .q4f16se_outerloop + + b .non_linear_loop + +.q4f16: + adr x4, .q40f16_const + movi v15.16b, 15 + ld1 {v13.16b}, [ x4 ] + eor v12.16b, v12.16b, v12.16b + +.q4f16_outerloop: + // scales + ld1 { v16.8h-v19.8h }, [ x1 ], #64 + ld1 { v20.8h-v23.8h }, [ x1 ], #64 + mov x4, #32 + +.p2align 4 +.q4f16_innerloop: + ld1 { v9.16b-v10.16b }, [x1], #32 + ld1 { v8.h }[0], [ x2 ], #2 + + and v0.16b, v9.16b, v15.16b + ushr v2.16b, v9.16b, 4 + + and v4.16b, v10.16b, v15.16b + ushr v6.16b, v10.16b, 4 + + tbl v0.16b, { v13.16b }, v0.16b + tbl v2.16b, { v13.16b }, v2.16b + tbl v4.16b, { v13.16b }, v4.16b + tbl v6.16b, { v13.16b }, v6.16b + + zip2 v1.16b, v12.16b, v0.16b + zip2 v3.16b, v12.16b, v2.16b + zip2 v5.16b, v12.16b, v4.16b + zip2 v7.16b, v12.16b, v6.16b + + zip1 v0.16b, v12.16b, v0.16b + zip1 v2.16b, v12.16b, v2.16b + zip1 v4.16b, v12.16b, v4.16b + zip1 v6.16b, v12.16b, v6.16b + +{% for i in (0..7) %} + fmul v{{i}}.8h, v{{i}}.8h, v{{i|plus:16}}.8h +{% endfor %} + +{% for i in (0..7) %} + fmla v{{ i|plus: 24 }}.8h, v{{i}}.8h, v8.h[0] +{% endfor %} + + subs x4, x4, #1 + bne .q4f16_innerloop + + subs x3, x3, #32 + bne .q4f16_outerloop + + b .non_linear_loop + +{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:24, to:31%} +{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:64, from:24, to:31%} +{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:64, from:24, to:31%} +{% include "arm64fp16_mmm_load_tile.tmpliq" from:24, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + cmp x6, #2 + beq .do_per_row_add + + {% for reg in (24..31) %} + {% for lane in (0..7) %} + ld1 {v0.h}[{{lane}}], [ x5 ], x6 + {% endfor %} + fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h + {% endfor %} + + b .non_linear_loop + +.do_per_row_add: + ld1 {v0.8h-v3.8h}, [x5], #64 + ld1 {v4.8h-v7.8h}, [x5], #64 + + {% for r in (0..7) %} + fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x3, [x0, 
#16] + ldr x2, [x0, #8] + + ld1 {v8.h}[0], [ x3 ] + + {% for r in (0..7) %} + ldr q{{r}}, [x2], #16 + {% endfor %} + + fmla v24.8h, v0.8h, v8.h[0] + fmla v25.8h, v1.8h, v8.h[0] + fmla v26.8h, v2.8h, v8.h[0] + fmla v27.8h, v3.8h, v8.h[0] + fmla v28.8h, v4.8h, v8.h[0] + fmla v29.8h, v5.8h, v8.h[0] + fmla v30.8h, v6.8h, v8.h[0] + fmla v31.8h, v7.8h, v8.h[0] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc$ + + cmp x6, #2 + beq .store_strides_contig + + {% for reg in (24..31) %} + {% for lane in (0..7) %} + st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for reg in (24..31) %} + st1 { v{{reg}}.8h }, [ x5 ], #16 + {% endfor %} + + b .non_linear_loop + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl new file mode 100644 index 000000000..9e949531a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl @@ -0,0 +1,165 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, no need to preserve + +// no preservation either for v0-v7... +// v8..v15 are callee-preserved +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_mmm_f16_64x3_{{core}}_{{suffix}} +{{G}}arm64fp16_mmm_f16_64x3_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing (x4 is loaded but not consulted by this kernel) + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 // k == 0: nothing to accumulate + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + ld1 { v7.4s }, [ x2 ] // loads 16 bytes although only v7.h[0..2] are used; NOTE(review): on the last k-step this reads 10 bytes past the 6-byte B row unless B is padded, confirm + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64 + ld1 { v4.8h, v5.8h, v6.8h }, [ x1 ], #48 // A row blocks 0..6; block 7 is loaded into v0 further down because v7 holds B + add x2, x2, #6 // 3 f16 per k-step + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.8h, v0.8h, v7.h[{{ col }}] +{% endfor %} + + ld1 { v0.8h }, [ x1 ], #16 // A row block 7, reusing v0 now that block 0 has been consumed + +{% for row in (1..6) %} + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}] + {% endfor %} +{% endfor %} + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:15}}.8h, v0.8h, v7.h[{{ col }}] +{% endfor %} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:8, to:31%} +{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:64, from:8, to:31%} +{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:64, from:8, to:31%} +{% include "arm64fp16_mmm_load_tile.tmpliq" from:8, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] // source base pointer and row byte stride (x6 steps x4 per lane below) + ldp x7, x8, [x0, #24] // x7 is the column byte stride (added to x5 per column); x8 is not used here + + {% for col in (0..2) %} + mov x4, x5 + {% for reg in (0..7) %} + {% for lane in (0..7) %} + ld1 {v0.h}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:8 | plus: 8| plus: reg}}.8h, v{{col | times:8 | plus: 8 | plus: reg}}.8h, v0.8h + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.do_per_row_add: // NOTE(review): no branch to this label is visible in this template and it only updates v24..v31 of the v8..v31 tile; it matches the 64x1 kernel's block verbatim, presumably dead copy-paste, confirm + ld1 {v0.8h-v3.8h}, [x5], #64 + ld1 {v4.8h-v7.8h}, [x5], #64 + + {% for r in (0..7) %} + fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldp x2, x3, [x0, #8] // x2: 64 row values, x3: 3 column values + + ld1 { v7.s }[0], [ x3 ], #4 // column values 0..1 + ld1 { v7.h }[2], [ x3 ], #2 // column value 2: exactly 6 bytes are read, unlike the 16-byte load in the matmul loop + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ], #64 + ld1 { v4.8h, v5.8h, v6.8h }, [ x2 ], #48 + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.8h, v0.8h, v7.h[{{ col }}] +{% endfor %} + + ld1 { v0.8h }, [ x2 ], #16 // row block 7, reusing v0 as in the matmul loop + +{% for row in
(1..6) %} + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}] + {% endfor %} +{% endfor %} + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:15}}.8h, v0.8h, v7.h[{{ col }}] +{% endfor %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #2 + beq .store_strides_contig + + {% for col in (0..2) %} + mov x4, x5 + {% for reg in (0..7) %} + {% for lane in (0..7) %} + st1 { v{{col | times:8 | plus: 8 | plus: reg}}.h }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for col in (0..2) %} + mov x4, x5 + {% for r in (0..7) %} + st1 { v{{col | times:8 | plus: 8 | plus: r}}.8h }, [ x4 ], 16 + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq new file mode 100644 index 000000000..6e9552bf1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq @@ -0,0 +1,9 @@ +// vim: ft=arm + +{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_min", op:"fmin", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_max", op:"fmax", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_mul", op:"fmul", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_add", op:"fadd", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_sub", op:"fsub", mr:mr, from:from, to:to %} +{% include 
"arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%} + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq new file mode 100644 index 000000000..477b8db1f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq @@ -0,0 +1,9 @@ +// vim: ft=arm + +{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_min", op:"fmin", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_max", op:"fmax", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_mul", op:"fmul", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_add", op:"fadd", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_sub", op:"fsub", mr:mr, from:from, to:to %} +{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%} + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq new file mode 100644 index 000000000..a448fe387 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq @@ -0,0 +1,36 @@ +// vim: ft=arm + +{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_min", op:"fmin", from:from, to:to %} +{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_max", op:"fmax", from:from, to:to %} +{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_mul", op:"fmul", from:from, to:to %} +{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_add", op:"fadd", from:from, to:to %} +{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_sub", op:"fsub", from:from, to:to %} +{% include "arm64fp16_mmm_8h_scalar.tmpliq" 
label:"scalar_sub_flipped", op:"fsub", from:from, to:to, flipped:true %} + +.clear: +{% for r in (from..to) %} + eor v{{r}}.8b, v{{r}}.8b, v{{r}}.8b +{% endfor %} + b .non_linear_loop + +.leaky_relu: + add x2, x0, #8 + ld1 {v4.s}[0], [ x2 ] + dup v4.8h, v4.h[0] + + // bsl cond/dst, then, else + // fcmge dst, src, #0.0 + {% for r in (from..to) %} + fmul v0.8h, v{{r}}.8h, v4.8h + fcmge v1.8h, v{{r}}.8h, #0.0 + bsl v1.16b, v{{r}}.16b, v0.16b + and v{{r}}.16b, v1.16b, v1.16b + {% endfor %} + + b .non_linear_loop + + +.q_scale: +.q_shl: +.q_shr: + b .unsupported diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq new file mode 100644 index 000000000..ac920b368 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq @@ -0,0 +1,10 @@ +// vim: ft=arm + +.load_tile: + ldr x2, [ x0, #8 ] + {% for reg in (from..to) %} + ld1 { v{{reg}}.4s }, [ x2 ], #16 + {% endfor %} + + b .non_linear_loop + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl new file mode 100644 index 000000000..7d0e76ef3 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl @@ -0,0 +1,131 @@ +// vim: ft=arm + +// no preservation either for v0-v7 and v16-v31 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_sigmoid_f16_8n_{{suffix}} +{{G}}arm64fp16_sigmoid_f16_8n_{{suffix}}: + + cmp x1, #0 + beq .return + + adr x2, .coeffs_num + ld1 { v0.8h }, [x2] + dup v5.8h, v0.h[0] // v5 <- low, broadcasted + dup v6.8h, v0.h[1] // v6 <- high, broadcasted + dup v7.8h, v0.h[7] // v7 <- half, broadcasted + + cmp x1, #32 + blt .loop + +.loop4: + ld1 { v16.8h, v17.8h, v18.8h, v19.8h }, [x0] + + fmax v16.8h, v16.8h, v5.8h + fmax v17.8h, v17.8h, v5.8h + fmax v18.8h, 
v18.8h, v5.8h + fmax v19.8h, v19.8h, v5.8h + + fmin v16.8h, v16.8h, v6.8h + fmin v17.8h, v17.8h, v6.8h + fmin v18.8h, v18.8h, v6.8h + fmin v19.8h, v19.8h, v6.8h // v16 <- x + + fmul v20.8h, v16.8h, v16.8h + fmul v21.8h, v17.8h, v17.8h + fmul v22.8h, v18.8h, v18.8h + fmul v23.8h, v19.8h, v19.8h // v20 <- x2 + + dup v28.8h, v0.h[3] + fmla v28.8h, v20.8h, v0.h[2] + dup v29.8h, v0.h[3] + fmla v29.8h, v21.8h, v0.h[2] + dup v30.8h, v0.h[3] + fmla v30.8h, v22.8h, v0.h[2] + dup v31.8h, v0.h[3] + fmla v31.8h, v23.8h, v0.h[2] + + dup v24.8h, v0.h[4] + fmla v24.8h, v20.8h, v28.8h + dup v25.8h, v0.h[4] + fmla v25.8h, v21.8h, v29.8h + dup v26.8h, v0.h[4] + fmla v26.8h, v22.8h, v30.8h + dup v27.8h, v0.h[4] + fmla v27.8h, v23.8h, v31.8h + + fmul v16.8h, v16.8h, v24.8h + fmul v17.8h, v17.8h, v25.8h + fmul v18.8h, v18.8h, v26.8h + fmul v19.8h, v19.8h, v27.8h // v16 <- numerator + + dup v24.8h, v0.h[6] + dup v25.8h, v0.h[6] + dup v26.8h, v0.h[6] + dup v27.8h, v0.h[6] + fmla v24.8h, v20.8h, v0.h[5] + fmla v25.8h, v21.8h, v0.h[5] + fmla v26.8h, v22.8h, v0.h[5] + fmla v27.8h, v23.8h, v0.h[5] // v24 <- denum + + fdiv v16.8h, v16.8h, v24.8h + fdiv v17.8h, v17.8h, v25.8h + fdiv v18.8h, v18.8h, v26.8h + fdiv v19.8h, v19.8h, v27.8h + + fadd v16.8h, v16.8h, v7.8h + fadd v17.8h, v17.8h, v7.8h + fadd v18.8h, v18.8h, v7.8h + fadd v19.8h, v19.8h, v7.8h + + st1 { v16.8h, v17.8h, v18.8h, v19.8h }, [x0], #64 + + subs x1, x1, #32 + cmp x1, #32 + bge .loop4 + + cmp x1, #0 + beq .return + +.loop: + ld1 { v16.8h }, [x0] + + fmax v16.8h, v16.8h, v5.8h + fmin v16.8h, v16.8h, v6.8h // v16 <- x + fmul v20.8h, v16.8h, v16.8h // v20 <- x2 + + dup v28.8h, v0.h[3] + fmla v28.8h, v20.8h, v0.h[2] + dup v24.8h, v0.h[4] + fmla v24.8h, v20.8h, v28.8h + fmul v16.8h, v16.8h, v24.8h // v16 <- numerator + + dup v24.8h, v0.h[6] + fmla v24.8h, v20.8h, v0.h[5] // v24 <- denum + + fdiv v16.8h, v16.8h, v24.8h + fadd v16.8h, v16.8h, v7.8h + + st1 { v16.8h }, [x0], #16 + + subs x1, x1, #8 + bne .loop + +.return: + ret + 
+.coeffs_num: + {{ -6.92 | float16 }} + {{ 6.92 | float16 }} + {{ -0.0000124702 | float16 }} + {{ 0.00400222 | float16 }} + + {{ 0.249895 | float16 }} + {{ 0.098734 | float16 }} + {{ 1.0 | float16 }} + {{ 0.5 | float16 }} diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl new file mode 100644 index 000000000..cd01f0455 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl @@ -0,0 +1,124 @@ +// vim: ft=arm + +// no preservation either for v0-v7 and v16-v31 + +.text +.align 4 + +{% if needs_pragma == true %} +.cpu generic+fp+simd+fp16 +{% endif %} +.global {{G}}arm64fp16_tanh_f16_8n_{{suffix}} +{{G}}arm64fp16_tanh_f16_8n_{{suffix}}: + + cmp x1, #0 + beq .return + + adr x2, .coeffs_num + ld1 { v0.8h }, [x2] + dup v5.8h, v0.h[0] // v5 <- low, broadcasted + dup v6.8h, v0.h[1] // v6 <- high, broadcasted + + cmp x1, #32 + blt .loop + +.loop4: + ld1 { v16.8h, v17.8h, v18.8h, v19.8h }, [x0] + + fmax v16.8h, v16.8h, v5.8h + fmax v17.8h, v17.8h, v5.8h + fmax v18.8h, v18.8h, v5.8h + fmax v19.8h, v19.8h, v5.8h + + fmin v16.8h, v16.8h, v6.8h + fmin v17.8h, v17.8h, v6.8h + fmin v18.8h, v18.8h, v6.8h + fmin v19.8h, v19.8h, v6.8h // v16 <- x + + fmul v20.8h, v16.8h, v16.8h + fmul v21.8h, v17.8h, v17.8h + fmul v22.8h, v18.8h, v18.8h + fmul v23.8h, v19.8h, v19.8h // v20 <- x2 + + dup v24.8h, v0.h[3] + fmla v24.8h, v20.8h, v0.h[2] + dup v25.8h, v0.h[3] + fmla v25.8h, v21.8h, v0.h[2] + dup v26.8h, v0.h[3] + fmla v26.8h, v22.8h, v0.h[2] + dup v27.8h, v0.h[3] + fmla v27.8h, v23.8h, v0.h[2] + + fmul v16.8h, v16.8h, v24.8h + fmul v17.8h, v17.8h, v25.8h + fmul v18.8h, v18.8h, v26.8h + fmul v19.8h, v19.8h, v27.8h // v16 <- numerator + + dup v28.8h, v0.h[5] + fmla v28.8h, v20.8h, v0.h[4] + dup v29.8h, v0.h[5] + fmla v29.8h, v21.8h, v0.h[4] + dup v30.8h, v0.h[5] + fmla v30.8h, v22.8h, v0.h[4] + dup v31.8h, v0.h[5] + fmla v31.8h, v23.8h, v0.h[4] + + dup 
v24.8h, v0.h[6] + fmla v24.8h, v20.8h, v28.8h + dup v25.8h, v0.h[6] + fmla v25.8h, v21.8h, v29.8h + dup v26.8h, v0.h[6] + fmla v26.8h, v22.8h, v30.8h + dup v27.8h, v0.h[6] + fmla v27.8h, v23.8h, v31.8h // v24 <- denum + + fdiv v16.8h, v16.8h, v24.8h + fdiv v17.8h, v17.8h, v25.8h + fdiv v18.8h, v18.8h, v26.8h + fdiv v19.8h, v19.8h, v27.8h + + st1 { v16.8h, v17.8h, v18.8h, v19.8h }, [x0], #64 + + subs x1, x1, #32 + cmp x1, #32 + bge .loop4 + + cmp x1, #0 + beq .return + +.loop: + ld1 { v16.8h }, [x0] + + fmax v16.8h, v16.8h, v5.8h + fmin v16.8h, v16.8h, v6.8h // v16 <- x + fmul v20.8h, v16.8h, v16.8h // v20 <- x2 + + dup v24.8h, v0.h[3] + fmla v24.8h, v20.8h, v0.h[2] + fmul v16.8h, v16.8h, v24.8h // v16 <- numerator + + dup v28.8h, v0.h[5] + fmla v28.8h, v20.8h, v0.h[4] + dup v24.8h, v0.h[6] + fmla v24.8h, v20.8h, v28.8h // v24 <- denum + + fdiv v16.8h, v16.8h, v24.8h + + st1 { v16.8h }, [x0], #16 + + subs x1, x1, #8 + bne .loop + +.return: + ret + +.coeffs_num: + {{ -3.84 | float16 }} + {{ 3.84 | float16 }} + {{ 0.082654955 | float16 }} // alpha + {{ 0.99963124 | float16 }} + + {{ 0.0065383179 | float16 }} // beta + {{ 0.41401828 | float16 }} + {{ 1.0 | float16 }} + {{ 0 | float16 }} // padding diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq new file mode 100644 index 000000000..150db4683 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq @@ -0,0 +1,37 @@ +// vim: ft=arm + +.non_linear: + sub x0, x0, 40 + +.non_linear_loop: + add x0, x0, 40 + ldr x2, [x0] + + mov x4, #{{ jump_table | size }} + + cmp x2, #{{ jump_table | size }} + csel x2, x2, x4, lt + cmp x2, #0 + csel x2, x4, x2, lt + + adr x3, .jmp_table + add x3, x3, x2, LSL#2 + br x3 + +.jmp_table: +{% for j in jump_table %} + b .{{j}} +{% endfor %} + b .unsupported + + add x0, x2, #4000 + b .return + +.unsupported: + mov x0, #1 + b .return + +.done: + mov x0, 0 + b .return + diff --git 
a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S new file mode 100644 index 000000000..3af092cfc --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S @@ -0,0 +1,13 @@ +// vim: ft=arm + +// serves as a canary build file to figure out which flag combination will accept half precision fmla + +.text +.align 4 + +// .cpu generic+fp+simd+fp16 +.global foo +foo: + fmla v16.8h, v0.8h, v8.h[0] + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S new file mode 100644 index 000000000..6fb61053e --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S @@ -0,0 +1,13 @@ +// vim: ft=arm + +// serves as a canary build file to figure out which flag combination will accept half precision fmla + +.text +.align 4 + +.cpu generic+fp+simd+fp16 +.global foo +foo: + fmla v16.8h, v0.8h, v8.h[0] + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq new file mode 100644 index 000000000..a50fa7be9 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq @@ -0,0 +1,36 @@ +// vim: ft=arm + +.{{label}}: + ldr x2, [x0, #8] + +{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%} +{% capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_4}}{%endcapture%} + +{% capture loads %}{{cols | divided_by:4}}{% endcapture %} + +{%if cols == "1" %} + ld1 {v0.s}[0], [ x2 ] +{% elsif cols == "3" %} + ld1 {v0.d}[0], [ x2 ], #8 + ld1 {v0.s}[2], [ x2 ] +{% else %} + {% for reg in (1..loads) %} + ldr q{{reg |minus:1}}, [ x2 ], #16 + {% endfor %} +{% endif %} + +// {{mr}} {{cols}} + +{% for col in (1..cols) %} + dup v3.4s, v{{col| minus: 1|divided_by:4}}.s[{{col| minus: 1|modulo:4}}] + {% for row in 
(1..mr_over_4) %} + {% capture acc %}{{ col|minus:1|times:mr_over_4|plus:row|minus:1|plus:from }}{% endcapture %} + {% if flipped %} + {{op}} v{{acc}}.4s, v{{acc}}.4s, v3.4s + {% else %} + {{op}} v{{acc}}.4s, v3.4s, v{{acc}}.4s + {% endif %} + {% endfor %} +{% endfor %} + +b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq new file mode 100644 index 000000000..1db62f2b7 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq @@ -0,0 +1,25 @@ +// vim: ft=arm + +.{{label}}: + ldr x2, [x0, #8] + +{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%} +{% capture mr_over_4_min_1 %}{{ mr | divided_by: 4 | minus: 1}}{%endcapture%} + +{% for reg in (0..mr_over_4_min_1) %} + ldr q{{reg}}, [ x2 ], #16 +{% endfor %} + +{% if flipped %} + {% for acc in (from..to) %} + {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%} + {{op}} v{{acc}}.4s, v{{acc}}.4s, v{{other}}.4s + {% endfor %} +{% else %} + {% for acc in (from..to) %} + {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%} + {{op}} v{{acc}}.4s, v{{other}}.4s, v{{acc}}.4s + {% endfor %} +{% endif %} + +b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq new file mode 100644 index 000000000..db2e10143 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq @@ -0,0 +1,18 @@ +// vim: ft=arm + +.{{label}}: + add x2, x0, #8 + ld1 {v0.s}[0], [ x2 ] + dup v0.4s, v0.s[0] + {% if flipped %} + {% for reg in (from..to) %} + {{op}} v{{reg}}.4s, v{{reg}}.4s, v0.4s + {% endfor %} + {% else %} + {% for reg in (from..to) %} + {{op}} v{{reg}}.4s, v0.4s, v{{reg}}.4s + {% endfor %} + {% endif %} + + b .non_linear_loop + diff --git 
a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli new file mode 100644 index 000000000..04deaee6c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli @@ -0,0 +1,69 @@ +fmla v8.4s, v0.4s, v4.s[0] +ldr w4, [x1], #4 +fmla v9.4s, v1.4s, v4.s[0] +ldr w20, [x2], #4 +fmla v10.4s, v2.4s, v4.s[0] +ldr w5, [x1], #4 + +fmla v11.4s, v0.4s, v4.s[1] +ldr w21, [x2], #4 +fmla v12.4s, v1.4s, v4.s[1] +ldr w6, [x1], #4 +fmla v13.4s, v2.4s, v4.s[1] +ldr w22, [x2], #4 + +fmla v14.4s, v0.4s, v4.s[2] +ldr w7, [x1], #4 +fmla v15.4s, v1.4s, v4.s[2] +ldr w23, [x2], #4 +fmla v16.4s, v2.4s, v4.s[2] +ldr w8, [x1], #4 +fmla v17.4s, v0.4s, v4.s[3] +ldr w24, [x2], #4 +fmla v18.4s, v1.4s, v4.s[3] +ldr w9, [x1], #4 +fmla v19.4s, v2.4s, v4.s[3] +ldr w25, [x2], #4 + +fmla v20.4s, v0.4s, v5.s[0] +ldr w10, [x1], #4 +fmla v21.4s, v1.4s, v5.s[0] +ldr w26, [x2], #4 +fmla v22.4s, v2.4s, v5.s[0] +ldr w11, [x1], #4 +fmla v23.4s, v0.4s, v5.s[1] +ldr w27, [x2], #4 +fmla v24.4s, v1.4s, v5.s[1] +ldr w12, [x1], #4 +fmla v25.4s, v2.4s, v5.s[1] + +fmla v26.4s, v0.4s, v5.s[2] +ldr w13, [x1], #4 +fmla v27.4s, v1.4s, v5.s[2] +fmla v28.4s, v2.4s, v5.s[2] +ldr w14, [x1], #4 +fmla v29.4s, v0.4s, v5.s[3] +fmla v30.4s, v1.4s, v5.s[3] +ldr w15, [x1], #4 +fmla v31.4s, v2.4s, v5.s[3] + +ins v0.s[0], w4 +ins v1.s[0], w8 +ins v2.s[0], w12 +ins v4.s[0], w20 +ins v5.s[0], w24 +ins v0.s[1], w5 +ins v1.s[1], w9 +ins v2.s[1], w13 +ins v4.s[1], w21 +ins v5.s[1], w25 +ins v0.s[2], w6 +ins v1.s[2], w10 +ins v2.s[2], w14 +ins v4.s[2], w22 +ins v5.s[2], w26 +ins v0.s[3], w7 +ins v1.s[3], w11 +ins v2.s[3], w15 +ins v4.s[3], w23 +ins v5.s[3], w27 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli 
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli new file mode 100644 index 000000000..f97e2527c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli @@ -0,0 +1,82 @@ +fmla v8.4s, v0.4s, v4.s[0] + ldr w4, [x1] +fmla v9.4s, v1.4s, v4.s[0] + ldr w20, [x2], #4 +fmla v10.4s, v2.4s, v4.s[0] + ldr w5, [x1, #4] + +fmla v11.4s, v0.4s, v4.s[1] + ldr w21, [x2], #4 +fmla v12.4s, v1.4s, v4.s[1] + ldr w6, [x1, #8] +fmla v13.4s, v2.4s, v4.s[1] + ldr w22, [x2], #4 + +fmla v14.4s, v0.4s, v4.s[2] + ldr w7, [x1, #12] +fmla v15.4s, v1.4s, v4.s[2] + ldr w23, [x2], #4 +fmla v16.4s, v2.4s, v4.s[2] + ldr w8, [x1, #16] +fmla v17.4s, v0.4s, v4.s[3] + ldr w24, [x2], #4 +fmla v18.4s, v1.4s, v4.s[3] + ldr w9, [x1, #20] +fmla v19.4s, v2.4s, v4.s[3] + ldr w25, [x2], #4 + +fmla v20.4s, v0.4s, v5.s[0] + ldr w10, [x1, #24] +fmla v21.4s, v1.4s, v5.s[0] + ldr w26, [x2], #4 +fmla v22.4s, v2.4s, v5.s[0] + ldr w11, [x1, #28] +fmla v23.4s, v0.4s, v5.s[1] + ldr w27, [x2], #4 +fmla v24.4s, v1.4s, v5.s[1] + ldr w12, [x1, #32] +fmla v25.4s, v2.4s, v5.s[1] + ldr w13, [x1, #36] + +fmla v26.4s, v0.4s, v5.s[2] + ldr w14, [x1, #40] +fmla v27.4s, v1.4s, v5.s[2] + ldr w15, [x1, #44] +fmla v28.4s, v2.4s, v5.s[2] + prfm pldl1keep, [x1, #512] +fmla v29.4s, v0.4s, v5.s[3] + add x1, x1, #48 +fmla v30.4s, v1.4s, v5.s[3] + prfm pldl1keep, [x2, #384] +fmla v31.4s, v2.4s, v5.s[3] + + ins v0.s[0], w4 + + ins v1.s[0], w8 + ins v2.s[0], w12 + + ins v4.s[0], w20 + ins v5.s[0], w24 + + ins v0.s[1], w5 + ins v1.s[1], w9 + + ins v2.s[1], w13 + ins v4.s[1], w21 + + ins v5.s[1], w25 + ins v0.s[2], w6 + + ins v1.s[2], w10 + ins v2.s[2], w14 + + ins v4.s[2], w22 + ins v5.s[2], w26 + + ins v0.s[3], w7 + ins v1.s[3], w11 + + ins v2.s[3], w15 + ins v4.s[3], w23 + ins v5.s[3], w27 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli 
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli new file mode 100644 index 000000000..a380bcccf --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli @@ -0,0 +1,60 @@ +fmla v8.4s, v0.4s, v4.s[0] + ldr x4, [x1] +fmla v9.4s, v1.4s, v4.s[0] + ldr x20, [x2] +fmla v10.4s, v2.4s, v4.s[0] + ldr x5, [x1, #8] + +fmla v11.4s, v0.4s, v4.s[1] + ldr x21, [x2, #8] +fmla v12.4s, v1.4s, v4.s[1] + ldr x6, [x1, #16] +fmla v13.4s, v2.4s, v4.s[1] + ldr x22, [x2, #16] + +fmla v14.4s, v0.4s, v4.s[2] + ldr x7, [x1, #24] +fmla v15.4s, v1.4s, v4.s[2] + ldr x23, [x2, #24] +fmla v16.4s, v2.4s, v4.s[2] + ldr x8, [x1, #32] +fmla v17.4s, v0.4s, v4.s[3] +fmla v18.4s, v1.4s, v4.s[3] + ldr x9, [x1, #40] +fmla v19.4s, v2.4s, v4.s[3] + +fmla v20.4s, v0.4s, v5.s[0] +fmla v21.4s, v1.4s, v5.s[0] +fmla v22.4s, v2.4s, v5.s[0] +fmla v23.4s, v0.4s, v5.s[1] +fmla v24.4s, v1.4s, v5.s[1] +fmla v25.4s, v2.4s, v5.s[1] + +fmla v26.4s, v0.4s, v5.s[2] +fmla v27.4s, v1.4s, v5.s[2] +fmla v28.4s, v2.4s, v5.s[2] + prfm pldl1keep, [x1, #512] +fmla v29.4s, v0.4s, v5.s[3] + add x1, x1, #48 +fmla v30.4s, v1.4s, v5.s[3] + prfm pldl1keep, [x2, #384] +fmla v31.4s, v2.4s, v5.s[3] + add x2, x2, #32 + + + ins v0.d[0], x4 + ins v2.d[0], x8 + + ins v4.d[0], x20 + ins v5.d[0], x22 + + ins v0.d[1], x5 + ins v2.d[1], x9 + + ins v4.d[1], x21 + ins v1.d[0], x6 + + ins v1.d[1], x7 + + ins v5.d[1], x23 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli new file mode 100644 index 000000000..67be44dd0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli @@ -0,0 +1,34 @@ +fmla v8.4s, v0.4s, v4.s[0] +fmla v9.4s, v1.4s, v4.s[0] +fmla v10.4s, v2.4s, v4.s[0] + +fmla v11.4s, v0.4s, v4.s[1] 
+fmla v12.4s, v1.4s, v4.s[1] +fmla v13.4s, v2.4s, v4.s[1] + +fmla v14.4s, v0.4s, v4.s[2] +fmla v15.4s, v1.4s, v4.s[2] +fmla v16.4s, v2.4s, v4.s[2] + +fmla v17.4s, v0.4s, v4.s[3] +fmla v18.4s, v1.4s, v4.s[3] +fmla v19.4s, v2.4s, v4.s[3] + +fmla v20.4s, v0.4s, v5.s[0] +fmla v21.4s, v1.4s, v5.s[0] +fmla v22.4s, v2.4s, v5.s[0] + +fmla v23.4s, v0.4s, v5.s[1] +fmla v24.4s, v1.4s, v5.s[1] +fmla v25.4s, v2.4s, v5.s[1] + +fmla v26.4s, v0.4s, v5.s[2] +fmla v27.4s, v1.4s, v5.s[2] +fmla v28.4s, v2.4s, v5.s[2] + +fmla v29.4s, v0.4s, v5.s[3] +fmla v30.4s, v1.4s, v5.s[3] +fmla v31.4s, v2.4s, v5.s[3] + +ld1 {{ v0.4s, v1.4s, v2.4s }}, [x1], #48 +ld1 {{ v4.4s, v5.4s }}, [x2], #32 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli new file mode 100644 index 000000000..f1ba56c93 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli @@ -0,0 +1,107 @@ +// mul a: v0, v1, v2, b: v4, v5 +// load a: d3/x23, d6/x26, d7/x27 +// load b: x4, x5, x6, x7 + +fmla v8.4s, v0.4s, v4.s[0] +ldr d3, [x1], #8 +fmla v9.4s, v1.4s, v4.s[0] +ldr x4, [x2], #8 +fmla v10.4s, v2.4s, v4.s[0] +ldr x23, [x1], #8 +fmla v11.4s, v0.4s, v4.s[1] +ldr x5, [x2], #8 +fmla v12.4s, v1.4s, v4.s[1] +ldr d6, [x1], #8 +fmla v13.4s, v2.4s, v4.s[1] +ldr x6, [x2], #8 +fmla v14.4s, v0.4s, v4.s[2] +ldr x26, [x1], #8 +fmla v15.4s, v1.4s, v4.s[2] +ldr x7, [x2], #8 +fmla v16.4s, v2.4s, v4.s[2] +ldr d7, [x1], #8 +fmla v17.4s, v0.4s, v4.s[3] +ldr x27, [x1], #8 +fmla v18.4s, v1.4s, v4.s[3] + +fmla v19.4s, v2.4s, v4.s[3] +fmla v20.4s, v0.4s, v5.s[0] + +// ins b: v4 <- x4/x5 +// ins a: d3/x23, d6/x26, d7/x27 + +ins v4.d[0], x4 +fmla v21.4s, v1.4s, v5.s[0] +ins v4.d[1], x5 +fmla v22.4s, v2.4s, v5.s[0] +fmla v23.4s, v0.4s, v5.s[1] + +fmla v24.4s, v1.4s, v5.s[1] +fmla v25.4s, v2.4s, v5.s[1] +fmla v26.4s, 
v0.4s, v5.s[2] +fmla v27.4s, v1.4s, v5.s[2] +fmla v28.4s, v2.4s, v5.s[2] +fmla v29.4s, v0.4s, v5.s[3] +ins v3.d[1], x23 +fmla v30.4s, v1.4s, v5.s[3] +ins v6.d[1], x26 +fmla v31.4s, v2.4s, v5.s[3] +ins v7.d[1], x27 + +// mul a: v3, v6, v7, b: v4, v5 +// ins b, v5 <- x6, x7 +// load a: d0/x20, d1/x21, d2/x22 +// load b: x4, x5 + +fmla v8.4s, v3.4s, v4.s[0] +ins v5.d[0], x6 +fmla v9.4s, v6.4s, v4.s[0] +ins v5.d[1], x7 +fmla v10.4s, v7.4s, v4.s[0] +ldr d0, [x1], #8 +fmla v11.4s, v3.4s, v4.s[1] +ldr x4, [x2], #8 +fmla v12.4s, v6.4s, v4.s[1] +ldr x20, [x1], #8 +fmla v13.4s, v7.4s, v4.s[1] +ldr x5, [x2], #8 +fmla v14.4s, v3.4s, v4.s[2] +ldr d1, [x1], #8 +fmla v15.4s, v6.4s, v4.s[2] +ldr x6, [x2], #8 +fmla v16.4s, v7.4s, v4.s[2] +ldr x21, [x1], #8 +fmla v17.4s, v3.4s, v4.s[3] +ldr x7, [x2], #8 + +// load b: x6, x7 +fmla v18.4s, v6.4s, v4.s[3] +ldr d2, [x1], #8 +fmla v19.4s, v7.4s, v4.s[3] +ldr x22, [x1], #8 +fmla v20.4s, v3.4s, v5.s[0] +fmla v21.4s, v6.4s, v5.s[0] +fmla v22.4s, v7.4s, v5.s[0] +fmla v23.4s, v3.4s, v5.s[1] +fmla v24.4s, v6.4s, v5.s[1] +fmla v25.4s, v7.4s, v5.s[1] + +// ins a: d0/x20, d1/x21, d2/x22 +fmla v26.4s, v3.4s, v5.s[2] +ins v0.d[1], x20 +fmla v27.4s, v6.4s, v5.s[2] +ins v1.d[1], x21 +fmla v28.4s, v7.4s, v5.s[2] +ins v2.d[1], x22 + +// ins b: v4 <- x4, x5 +fmla v29.4s, v3.4s, v5.s[3] +ins v4.d[0], x4 +fmla v30.4s, v6.4s, v5.s[3] +ins v4.d[1], x5 +fmla v31.4s, v7.4s, v5.s[3] + +// ins b: v5 <- x6, x7 +ins v5.d[0], x6 +ins v5.d[1], x7 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl new file mode 100644 index 000000000..0c5657456 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl @@ -0,0 +1,163 @@ +// vim: ft=arm + +// C tile regs: +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve +// +// v8 v11 v14 v17 v20 v23 v26 v29 +// 
v9 v12 v15 v18 v21 v24 v27 v30 +// v10 v13 v16 v19 v22 v25 v28 v31 + +// no preservation for v0-v7: +// packed A buffering (2x8 values): rotating over v0..v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + ld1 { v0.4s, v1.4s, v2.4s }, [ x1 ], #48 + ld1 { v4.4s, v5.4s }, [ x2 ], #32 + +{% capture packed_packed_loop1 %} + {% if core == "a53" %} + {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli" %} + {% else %} + {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli" %} + {% endif %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% if core == "a55" %} + {% include "arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli" %} + {% else %} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {% endif %} +{% endcapture %} + + cmp x3, #4 + blt .packed_packed_loop_1 + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + cmp x3, #0 + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31%} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:12, from:8, to:31 %} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:12, from:8, to:31 %} +{% include "arm64simd_mmm_load_tile.tmpliq" from:8, to:31 %} + +.add_unicast: + ldp 
x5, x6, [x0, #8 ] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + {% for col in (0..7) %} + mov x4, x5 + {% for reg in (0..2) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:3 | plus: 8| plus: reg}}.4s, v{{col | times:3 | plus: 8 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.4s, v1.4s, v2.4s }, [ x2 ] + ld1 { v4.4s, v5.4s }, [ x3 ] + + {% for col in (0..7) %} + {% for reg in (0..2) %} + fmla v{{col | times:3 | plus: 8 | plus: reg}}.4s, v{{reg}}.4s, v{{col| divided_by:4 | plus: 4}}.s[{{col| modulo: 4}}] + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #4 + bne .store_strides_generic + + {% for col in (0..7) %} + str q{{col | times:3 | plus: 8 }}, [ x5 ] + str q{{col | times:3 | plus: 9}}, [ x5, #16 ] + str q{{col | times:3 | plus: 10}}, [ x5, #32 ] + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_generic: + {% for col in (0..7) %} + mov x4, x5 + {% for reg in (0..2) %} + {% for lane in (0..3) %} + st1 { v{{col | times:3 | plus: 8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli new file mode 100644 index 000000000..2ae7a54eb --- /dev/null +++ 
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli @@ -0,0 +1,45 @@ +fmla v16.4s, v0.4s, v4.s[0] +ldr x5, [x1] +fmla v17.4s, v1.4s, v4.s[0] +ldr x6, [x1, #8] +fmla v18.4s, v2.4s, v4.s[0] +ldr x7, [x1, #16] +fmla v19.4s, v3.4s, v4.s[0] +ldr x8, [x1, #24] +fmla v20.4s, v0.4s, v4.s[1] +ldr x9, [x1, #32] +fmla v21.4s, v1.4s, v4.s[1] +ldr x10, [x1, #40] +fmla v22.4s, v2.4s, v4.s[1] +ldr x11, [x1, #48] +fmla v23.4s, v3.4s, v4.s[1] +ldr x12, [x1, #56] + +fmla v24.4s, v0.4s, v4.s[2] +ldr x24, [x2] +fmla v25.4s, v1.4s, v4.s[2] +ldr x25, [x2, #8] +fmla v26.4s, v2.4s, v4.s[2] +add x1, x1, #64 +fmla v27.4s, v3.4s, v4.s[2] +add x2, x2, #16 +fmla v28.4s, v0.4s, v4.s[3] +prfm pldl1keep, [x1, #256] +fmla v29.4s, v1.4s, v4.s[3] +prfm pldl1keep, [x2, #256] +fmla v30.4s, v2.4s, v4.s[3] +prfm pldl1keep, [x1, #256] +fmla v31.4s, v3.4s, v4.s[3] + +ins v0.d[0], x5 +ins v2.d[0], x9 +ins v1.d[0], x7 +ins v3.d[0], x11 +ins v4.d[0], x24 + +ins v0.d[1], x6 +ins v2.d[1], x10 +ins v1.d[1], x8 +ins v3.d[1], x12 +ins v4.d[1], x25 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli new file mode 100644 index 000000000..637466515 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli @@ -0,0 +1,21 @@ + +fmla v16.4s, v0.4s, v4.s[0] +fmla v17.4s, v1.4s, v4.s[0] +fmla v18.4s, v2.4s, v4.s[0] +fmla v19.4s, v3.4s, v4.s[0] +fmla v20.4s, v0.4s, v4.s[1] +fmla v21.4s, v1.4s, v4.s[1] +fmla v22.4s, v2.4s, v4.s[1] +fmla v23.4s, v3.4s, v4.s[1] + +fmla v24.4s, v0.4s, v4.s[2] +fmla v25.4s, v1.4s, v4.s[2] +fmla v26.4s, v2.4s, v4.s[2] +fmla v27.4s, v3.4s, v4.s[2] +fmla v28.4s, v0.4s, v4.s[3] +fmla v29.4s, v1.4s, v4.s[3] +fmla v30.4s, v2.4s, v4.s[3] +fmla v31.4s, v3.4s, v4.s[3] + +ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64 +ld1 {{ v4.4s 
}}, [ x2 ], #16 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli new file mode 100644 index 000000000..c0b2f502c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli @@ -0,0 +1,73 @@ +// mul a: v0, v1, v2, v3 b: v4 +// load a: v5(d5/x5), v6(d6,x6), v7(d7,x7), v8(d8, x8) +// load b: v9(d9/x9) + +fmla v16.4s, v0.4s, v4.s[0] +ldr d5, [x1], #8 +fmla v17.4s, v1.4s, v4.s[0] +ldr d9, [x2], #8 +fmla v18.4s, v2.4s, v4.s[0] +ldr x5, [x1], #8 +fmla v19.4s, v3.4s, v4.s[0] +ldr x9, [x2], #8 +fmla v20.4s, v0.4s, v4.s[1] +ldr d6, [x1], #8 +fmla v21.4s, v1.4s, v4.s[1] +ldr x6, [x1], #8 +fmla v22.4s, v2.4s, v4.s[1] +ldr d7, [x1], #8 +fmla v23.4s, v3.4s, v4.s[1] +ldr x7, [x1], #8 + +fmla v24.4s, v0.4s, v4.s[2] +ldr d8, [x1], #8 +fmla v25.4s, v1.4s, v4.s[2] +ldr x8, [x1], #8 +fmla v26.4s, v2.4s, v4.s[2] +ins v5.d[1], x5 +fmla v27.4s, v3.4s, v4.s[2] +ins v6.d[1], x6 +fmla v28.4s, v0.4s, v4.s[3] +ins v7.d[1], x7 +fmla v29.4s, v1.4s, v4.s[3] +ins v8.d[1], x8 +fmla v30.4s, v2.4s, v4.s[3] +ins v9.d[1], x9 +fmla v31.4s, v3.4s, v4.s[3] + +// mul a: v5, v6, v7, v8 b: v9 +// load a: v0(d0/x5), v1(d1,x6), v2(d2,x7), v3(d3, x8) +// load b: v4(d4/x9) + +fmla v16.4s, v5.4s, v9.s[0] +ldr d0, [x1], #8 +fmla v17.4s, v6.4s, v9.s[0] +ldr d4, [x2], #8 +fmla v18.4s, v7.4s, v9.s[0] +ldr x5, [x1], #8 +fmla v19.4s, v8.4s, v9.s[0] +ldr x9, [x2], #8 +fmla v20.4s, v5.4s, v9.s[1] +ldr d1, [x1], #8 +fmla v21.4s, v6.4s, v9.s[1] +ldr x6, [x1], #8 +fmla v22.4s, v7.4s, v9.s[1] +ldr d2, [x1], #8 +fmla v23.4s, v8.4s, v9.s[1] +ldr x7, [x1], #8 + +fmla v24.4s, v5.4s, v9.s[2] +ldr d3, [x1], #8 +fmla v25.4s, v6.4s, v9.s[2] +ldr x8, [x1], #8 +fmla v26.4s, v7.4s, v9.s[2] +ins v0.d[1], x5 +fmla v27.4s, v8.4s, v9.s[2] +ins v1.d[1], x6 +fmla v28.4s, v5.4s, v9.s[3] +ins v2.d[1], x7 +fmla 
v29.4s, v6.4s, v9.s[3] +ins v3.d[1], x8 +fmla v30.4s, v7.4s, v9.s[3] +ins v4.d[1], x9 +fmla v31.4s, v8.4s, v9.s[3] diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl new file mode 100644 index 000000000..cb8b6e533 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl @@ -0,0 +1,174 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, (scratch) +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve + +// v8 is used, d8 (lower half) must preserved +// v0-v7 (scratch registers) +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_f32_16x4_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_16x4_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s }, [ x2 ], #16 + +{% capture packed_packed_loop1 %} + {% if core == "a53" %} + {% include "arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli" %} + {% else %} + {% include "arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli" %} + {% endif %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% if core == "a55" %} + {% include "arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli" %} + {% else %} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {% endif %} +{% endcapture %} + + cmp x3, #4 + blt .packed_packed_loop_1 + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + cmp x3, #0 + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:16, from:16, to:31 %} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:16, from:16, to:31 %} +{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + {% for col in (0..3) %} + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:4 | plus: 16| plus: reg}}.4s, v{{col | times:4 | plus: 16 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ] + ld1 { v4.4s }, [ x3 ] + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v2.4s, v4.s[0] + fmla v19.4s, 
v3.4s, v4.s[0] + fmla v20.4s, v0.4s, v4.s[1] + fmla v21.4s, v1.4s, v4.s[1] + fmla v22.4s, v2.4s, v4.s[1] + fmla v23.4s, v3.4s, v4.s[1] + + fmla v24.4s, v0.4s, v4.s[2] + fmla v25.4s, v1.4s, v4.s[2] + fmla v26.4s, v2.4s, v4.s[2] + fmla v27.4s, v3.4s, v4.s[2] + fmla v28.4s, v0.4s, v4.s[3] + fmla v29.4s, v1.4s, v4.s[3] + fmla v30.4s, v2.4s, v4.s[3] + fmla v31.4s, v3.4s, v4.s[3] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #4 + bne .store_strides_generic + + {% for col in (0..3) %} + str q{{col | times:4 | plus:16 | plus: 0}}, [ x5 ] + str q{{col | times:4 | plus:16 | plus: 1}}, [ x5, #16 ] + str q{{col | times:4 | plus:16 | plus: 2}}, [ x5, #32 ] + str q{{col | times:4 | plus:16 | plus: 3}}, [ x5, #48 ] + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_generic: + + {% for col in (0..3) %} + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..3) %} + st1 { v{{col | times:4 | plus: 16 | plus: reg}}.s }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli new file mode 100644 index 000000000..b27363856 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli @@ -0,0 +1,73 @@ +// mul a: v0, v1, v2, v3 b: v4 +// load a: v5(d5/x5), v6(d6,x6), v7(d7,x7), v8(d8, x8) +// load b: v9(d9/x9) + +fmla v16.4s, v0.4s, v4.s[0] +ldr d5, [x1], #8 +fmla v17.4s, v1.4s, v4.s[0] +ldr d9, [x2], #8 +fmla v18.4s, v2.4s, v4.s[0] +ldr x5, [x1], #8 +fmla 
v19.4s, v3.4s, v4.s[0] +ldr x9, [x2], #8 +fmla v20.4s, v0.4s, v4.s[1] +ldr d6, [x1], #8 +fmla v21.4s, v1.4s, v4.s[1] +ldr x6, [x1], #8 +fmla v22.4s, v2.4s, v4.s[1] +ldr d7, [x1], #8 +fmla v23.4s, v3.4s, v4.s[1] +ldr x7, [x1], #8 + +fmla v24.4s, v0.4s, v4.s[2] +ldr d8, [x1], #8 +fmla v25.4s, v1.4s, v4.s[2] +ldr x8, [x1], #8 +fmla v26.4s, v2.4s, v4.s[2] +ins v5.d[1], x5 +fmla v27.4s, v3.4s, v4.s[2] +ins v6.d[1], x6 +fmla v28.4s, v0.4s, v4.s[3] +ins v7.d[1], x7 +fmla v29.4s, v1.4s, v4.s[3] +ins v8.d[1], x8 +fmla v30.4s, v2.4s, v4.s[3] +ins v9.d[1], x9 +fmla v31.4s, v3.4s, v4.s[3] + +// mul a: v5, v6, v7, v8 b: v9 +// load a: v0(d0/x5), v1(d1,x6), v2(d2,x7), v3(d3, x8) +// load b: v4(d4/x9) + +fmla v16.4s, v5.4s, v9.s[0] +ldr d0, [x1], #8 +fmla v17.4s, v6.4s, v9.s[0] +ldr d4, [x2], #8 +fmla v18.4s, v7.4s, v9.s[0] +ldr x5, [x1], #8 +fmla v19.4s, v8.4s, v9.s[0] +ldr x9, [x2], #8 +fmla v20.4s, v5.4s, v9.s[1] +ldr d1, [x1], #8 +fmla v21.4s, v6.4s, v9.s[1] +ldr x6, [x1], #8 +fmla v22.4s, v7.4s, v9.s[1] +ldr d2, [x1], #8 +fmla v23.4s, v8.4s, v9.s[1] +ldr x7, [x1], #8 + +fmla v24.4s, v5.4s, v9.s[2] +ldr d3, [x1], #8 +fmla v25.4s, v6.4s, v9.s[2] +ldr x8, [x1], #8 +fmla v26.4s, v7.4s, v9.s[2] +ins v0.d[1], x5 +fmla v27.4s, v8.4s, v9.s[2] +ins v1.d[1], x6 +fmla v28.4s, v5.4s, v9.s[3] +ins v2.d[1], x7 +fmla v29.4s, v6.4s, v9.s[3] +ins v3.d[1], x8 +fmla v30.4s, v7.4s, v9.s[3] +ins v4.d[1], x9 +fmla v31.4s, v8.4s, v9.s[3] diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli new file mode 100644 index 000000000..6742473ea --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli @@ -0,0 +1,63 @@ + +fmla v8.4s, v0.4s, v7.s[0] + ldr x4, [x1] +fmla v9.4s, v1.4s, v7.s[0] + ldr x5, [x1, #8] +fmla v10.4s, v2.4s, v7.s[0] + ldr x6, [x1, #16] +fmla v11.4s, v3.4s, 
v7.s[0] + ldr x7, [x1, #24] +fmla v12.4s, v4.4s, v7.s[0] + ldr x8, [x1, #32] +fmla v13.4s, v5.4s, v7.s[0] + ldr x9, [x1, #40] + +fmla v14.4s, v0.4s, v7.s[1] + ldr x10, [x1, #48] +fmla v15.4s, v1.4s, v7.s[1] + ldr x11, [x1, #56] +fmla v16.4s, v2.4s, v7.s[1] + ldr x12, [x1, #64] +fmla v17.4s, v3.4s, v7.s[1] + ldr x13, [x1, #72] +fmla v18.4s, v4.4s, v7.s[1] + ldr x14, [x1, #80] +fmla v19.4s, v5.4s, v7.s[1] + ldr x15, [x1, #88] + +fmla v20.4s, v0.4s, v7.s[2] + ldr x20, [x2] +fmla v21.4s, v1.4s, v7.s[2] + ldr x21, [x2, #8] +fmla v22.4s, v2.4s, v7.s[2] + add x1, x1, #96 +fmla v23.4s, v3.4s, v7.s[2] + add x2, x2, #16 +fmla v24.4s, v4.4s, v7.s[2] + prfm pldl1keep, [x1, #256] +fmla v25.4s, v5.4s, v7.s[2] + prfm pldl1keep, [x2, #256] + +fmla v26.4s, v0.4s, v7.s[3] + prfm pldl1keep, [x1, #320] +fmla v27.4s, v1.4s, v7.s[3] +fmla v28.4s, v2.4s, v7.s[3] +fmla v29.4s, v3.4s, v7.s[3] +fmla v30.4s, v4.4s, v7.s[3] +fmla v31.4s, v5.4s, v7.s[3] + +ins v0.d[0], x4 +ins v1.d[0], x6 +ins v2.d[0], x8 +ins v3.d[0], x10 +ins v4.d[0], x12 +ins v5.d[0], x14 +ins v7.d[0], x20 + +ins v0.d[1], x5 +ins v1.d[1], x7 +ins v2.d[1], x9 +ins v3.d[1], x11 +ins v4.d[1], x13 +ins v5.d[1], x15 +ins v7.d[1], x21 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli new file mode 100644 index 000000000..93307bf80 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli @@ -0,0 +1,53 @@ +fmla v8.4s, v0.4s, v7.s[0] +fmla v14.4s, v0.4s, v7.s[1] + prfm pldl1keep, [x2, #256] +fmla v20.4s, v0.4s, v7.s[2] +fmla v26.4s, v0.4s, v7.s[3] + ldr d0, [x1], #8 +fmla v9.4s, v1.4s, v7.s[0] + ldr x5, [x1], #8 +fmla v15.4s, v1.4s, v7.s[1] + ldr x20, [x2], #8 +fmla v21.4s, v1.4s, v7.s[2] + ldr x21, [x2], #8 +fmla v27.4s, v1.4s, v7.s[3] + ldr d1, [x1], #8 +fmla v10.4s, v2.4s, v7.s[0] + ldr x7, [x1], 
#8 +fmla v16.4s, v2.4s, v7.s[1] + prfm pldl1keep, [x1, #256] +fmla v22.4s, v2.4s, v7.s[2] + prfm pldl1keep, [x1, #320] +fmla v28.4s, v2.4s, v7.s[3] + ldr d2, [x1], #8 +fmla v11.4s, v3.4s, v7.s[0] + ldr x9, [x1], #8 +fmla v17.4s, v3.4s, v7.s[1] + ins v0.d[1], x5 +fmla v23.4s, v3.4s, v7.s[2] + ins v1.d[1], x7 +fmla v29.4s, v3.4s, v7.s[3] + ldr d3, [x1], #8 +fmla v12.4s, v4.4s, v7.s[0] + ldr x11, [x1], #8 +fmla v18.4s, v4.4s, v7.s[1] + ins v2.d[1], x9 +fmla v24.4s, v4.4s, v7.s[2] +fmla v30.4s, v4.4s, v7.s[3] + ldr d4, [x1], #8 +fmla v13.4s, v5.4s, v7.s[0] + ldr x13, [x1], #8 +fmla v19.4s, v5.4s, v7.s[1] + ldr x14, [x1], #8 +fmla v25.4s, v5.4s, v7.s[2] + ldr x15, [x1], #8 +fmla v31.4s, v5.4s, v7.s[3] + +ins v7.d[0], x20 +ins v7.d[1], x21 + +ins v5.d[0], x14 +ins v5.d[1], x15 + +ins v3.d[1], x11 +ins v4.d[1], x13 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli new file mode 100644 index 000000000..7b12d7443 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli @@ -0,0 +1,31 @@ +fmla v8.4s, v0.4s, v7.s[0] +fmla v9.4s, v1.4s, v7.s[0] +fmla v10.4s, v2.4s, v7.s[0] +fmla v11.4s, v3.4s, v7.s[0] +fmla v12.4s, v4.4s, v7.s[0] +fmla v13.4s, v5.4s, v7.s[0] + +fmla v14.4s, v0.4s, v7.s[1] +fmla v15.4s, v1.4s, v7.s[1] +fmla v16.4s, v2.4s, v7.s[1] +fmla v17.4s, v3.4s, v7.s[1] +fmla v18.4s, v4.4s, v7.s[1] +fmla v19.4s, v5.4s, v7.s[1] + +fmla v20.4s, v0.4s, v7.s[2] +fmla v21.4s, v1.4s, v7.s[2] +fmla v22.4s, v2.4s, v7.s[2] +fmla v23.4s, v3.4s, v7.s[2] +fmla v24.4s, v4.4s, v7.s[2] +fmla v25.4s, v5.4s, v7.s[2] + +fmla v26.4s, v0.4s, v7.s[3] +fmla v27.4s, v1.4s, v7.s[3] +fmla v28.4s, v2.4s, v7.s[3] +fmla v29.4s, v3.4s, v7.s[3] +fmla v30.4s, v4.4s, v7.s[3] +fmla v31.4s, v5.4s, v7.s[3] + +ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64 +ld1 {{ v4.4s, v5.4s }}, [ x1 ], 
#32 +ld1 {{ v7.4s }}, [ x2 ], #16 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl new file mode 100644 index 000000000..f77fbd3fd --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl @@ -0,0 +1,185 @@ +// vim: ft=arm + +// x20..x27 are used, callee-preserved + +// C tile regs: v8 to v31, (scratch) +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve + +// v8 is used, d8 (lower half) must be preserved +// v0-v7 (scratch registers) +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_f32_24x4_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_24x4_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s }, [ x1 ], #32 + ld1 { v7.4s }, [ x2 ], #16 + +{% capture packed_packed_loop1 %} + {% if core == "a53" %} + {% include "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli" %} + {% elsif core == "a55" %} + {% include "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli" %} + {% else %} + {% include "arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli" %} + {% endif %} +{% endcapture %} + + cmp x3, #4 + blt .packed_packed_loop_1 + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + cmp x3, #0 + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31 %} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:24, from:8, to:31 %} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:24, from:8, to:31 %} +{% include "arm64simd_mmm_load_tile.tmpliq" from:8, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + {% for col in (0..3) %} + mov x4, x5 + {% for reg in (0..5) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:6 | plus: 8 | plus: reg}}.4s, v{{col | times:6 | plus: 8 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ], #64 + ld1 { v7.4s }, [ x3 ] + ld1 { v4.4s, v5.4s }, [ x2 ] + + fmla v8.4s, v0.4s, v7.s[0] + fmla v9.4s, v1.4s, v7.s[0] + fmla v10.4s, v2.4s, v7.s[0] + fmla v11.4s, v3.4s, v7.s[0] 
+ fmla v12.4s, v4.4s, v7.s[0] + fmla v13.4s, v5.4s, v7.s[0] + + fmla v14.4s, v0.4s, v7.s[1] + fmla v15.4s, v1.4s, v7.s[1] + fmla v16.4s, v2.4s, v7.s[1] + fmla v17.4s, v3.4s, v7.s[1] + fmla v18.4s, v4.4s, v7.s[1] + fmla v19.4s, v5.4s, v7.s[1] + + fmla v20.4s, v0.4s, v7.s[2] + fmla v21.4s, v1.4s, v7.s[2] + fmla v22.4s, v2.4s, v7.s[2] + fmla v23.4s, v3.4s, v7.s[2] + fmla v24.4s, v4.4s, v7.s[2] + fmla v25.4s, v5.4s, v7.s[2] + + fmla v26.4s, v0.4s, v7.s[3] + fmla v27.4s, v1.4s, v7.s[3] + fmla v28.4s, v2.4s, v7.s[3] + fmla v29.4s, v3.4s, v7.s[3] + fmla v30.4s, v4.4s, v7.s[3] + fmla v31.4s, v5.4s, v7.s[3] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #4 + bne .store_strides_generic + + {% for col in (0..3) %} + str q{{col | times:6 | plus:8 | plus: 0}}, [ x5 ] + str q{{col | times:6 | plus:8 | plus: 1}}, [ x5, #16 ] + str q{{col | times:6 | plus:8 | plus: 2}}, [ x5, #32 ] + str q{{col | times:6 | plus:8 | plus: 3}}, [ x5, #48 ] + str q{{col | times:6 | plus:8 | plus: 4}}, [ x5, #64 ] + str q{{col | times:6 | plus:8 | plus: 5}}, [ x5, #80 ] + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_generic: + + {% for col in (0..3) %} + mov x4, x5 + {% for reg in (0..5) %} + {% for lane in (0..3) %} + st1 { v{{col | times:6 | plus:8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl new file mode 100644 index 000000000..336248a3c --- /dev/null +++ 
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl @@ -0,0 +1,403 @@ +// vim: ft=arm + +// C tile regs: +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve +// +// v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0] +// v16[1] v18[1] +// v16[2] v18[2] +// v16[3] v18[3] +// +// v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0] +// v17[1] v19[1] +// v17[2] v19[2] +// v17[3] v19[3] + +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_f32_32x1_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_32x1_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + cmp x4, #1 + beq .q4f16se + cmp x4, #2 + beq .q4f32se + cmp x4, #3 + beq .f16f16 + cmp x4, #4 + beq .f32f16 + cmp x4, #5 + beq .f16f32 + + sub x3, x3, #1 + +.p2align 4 +.packed_packed_loop_1: + ld1 { v8.s }[0], [ x2 ], #4 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64 + + fmla v24.4s, v0.4s, v8.s[0] + fmla v25.4s, v1.4s, v8.s[0] + fmla v26.4s, v2.4s, v8.s[0] + fmla v27.4s, v3.4s, v8.s[0] + fmla v28.4s, v4.4s, v8.s[0] + fmla v29.4s, v5.4s, v8.s[0] + fmla v30.4s, v6.4s, v8.s[0] + fmla v31.4s, v7.4s, v8.s[0] + + subs x3, x3, #1 + bge .packed_packed_loop_1 + + b .non_linear_loop + +.p2align 8 +.q40f16_const: + .byte 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc + .byte 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, 0x46, 0x47 + +.q4f16se: + adr x4, .q40f16_const + movi v15.16b, 15 + 
ld1 {v13.16b}, [ x4 ] + eor v12.16b, v12.16b, v12.16b + +.q4f16se_outerloop: +{% for i in (0..7) %} + eor v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b +{% endfor %} + mov x4, #32 + +.p2align 4 +.q4f16se_innerloop: + ld1 { v10.16b }, [ x1 ], #16 + ld1 { v11.h }[0], [ x2 ], #2 + + and v9.16b, v10.16b, v15.16b + ushr v10.16b, v10.16b, 4 + + tbl v9.16b, { v13.16b }, v9.16b + tbl v10.16b, { v13.16b }, v10.16b + + zip1 v0.16b, v12.16b, v9.16b + zip2 v2.16b, v12.16b, v9.16b + zip1 v4.16b, v12.16b, v10.16b + zip2 v6.16b, v12.16b, v10.16b + + fcvtl v11.4s, v11.4h + + fcvtl2 v1.4s, v0.8h + fcvtl2 v3.4s, v2.8h + fcvtl2 v5.4s, v4.8h + fcvtl2 v7.4s, v6.8h + fcvtl v0.4s, v0.4h + fcvtl v2.4s, v2.4h + fcvtl v4.4s, v4.4h + fcvtl v6.4s, v6.4h + +{% for i in (0..7) %} + fmla v{{ i|plus: 16 }}.4s, v{{i}}.4s, v11.s[0] +{% endfor %} + + subs x4, x4, #1 + bne .q4f16se_innerloop + + // scales + ld1 { v0.8h-v3.8h }, [ x1 ], #64 + + fcvtl v4.4s, v0.4h + fcvtl2 v5.4s, v0.8h + fcvtl v6.4s, v1.4h + fcvtl2 v7.4s, v1.8h + fcvtl v8.4s, v2.4h + fcvtl2 v9.4s, v2.8h + fcvtl v10.4s, v3.4h + fcvtl2 v11.4s, v3.8h + +{% for i in (0..7) %} + fmla v{{i|plus:24}}.4s, v{{i|plus:4}}.4s, v{{i|plus:16}}.4s +{% endfor %} + + subs x3, x3, #32 + bne .q4f16se_outerloop + + b .non_linear_loop + +.q4f32se: + adr x4, .q40f16_const + movi v15.16b, 15 + ld1 {v13.16b}, [ x4 ] + eor v12.16b, v12.16b, v12.16b + +.q4f32se_outerloop: +{% for i in (0..7) %} + eor v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b +{% endfor %} + mov x4, #32 + +.p2align 4 +.q4f32se_innerloop: + ld1 { v10.16b }, [ x1 ], #16 + ld1 { v11.s }[0], [ x2 ], #4 + + and v9.16b, v10.16b, v15.16b + ushr v10.16b, v10.16b, 4 + + tbl v9.16b, { v13.16b }, v9.16b + tbl v10.16b, { v13.16b }, v10.16b + + zip1 v0.16b, v12.16b, v9.16b + zip2 v2.16b, v12.16b, v9.16b + zip1 v4.16b, v12.16b, v10.16b + zip2 v6.16b, v12.16b, v10.16b + + fcvtl2 v1.4s, v0.8h + fcvtl2 v3.4s, v2.8h + fcvtl2 v5.4s, v4.8h + fcvtl2 v7.4s, v6.8h + fcvtl v0.4s, v0.4h + 
fcvtl v2.4s, v2.4h + fcvtl v4.4s, v4.4h + fcvtl v6.4s, v6.4h + +{% for i in (0..7) %} + fmla v{{ i|plus: 16 }}.4s, v{{i}}.4s, v11.s[0] +{% endfor %} + + subs x4, x4, #1 + bne .q4f32se_innerloop + + // scales + ld1 { v0.8h-v3.8h }, [ x1 ], #64 + + fcvtl v4.4s, v0.4h + fcvtl2 v5.4s, v0.8h + fcvtl v6.4s, v1.4h + fcvtl2 v7.4s, v1.8h + fcvtl v8.4s, v2.4h + fcvtl2 v9.4s, v2.8h + fcvtl v10.4s, v3.4h + fcvtl2 v11.4s, v3.8h + +{% for i in (0..7) %} + fmla v{{i|plus:24}}.4s, v{{i|plus:4}}.4s, v{{i|plus:16}}.4s +{% endfor %} + + subs x3, x3, #32 + bne .q4f32se_outerloop + + b .non_linear_loop + +.p2align 4 +.f16f16: + sub x3, x3, #1 +.f16f16_loop: + ld1 { v9.h }[0], [ x2 ], #2 + ld1 { v10.8h-v13.8h }, [ x1 ], #64 + + fcvtl v8.4s, v9.4h + {% for reg in (0..3) %} + fcvtl v{{reg|times:2}}.4s, v{{reg|plus:10}}.4h + fcvtl2 v{{reg|times:2|plus:1}}.4s, v{{reg|plus:10}}.8h + {% endfor %} + + fmla v24.4s, v0.4s, v8.s[0] + fmla v25.4s, v1.4s, v8.s[0] + fmla v26.4s, v2.4s, v8.s[0] + fmla v27.4s, v3.4s, v8.s[0] + fmla v28.4s, v4.4s, v8.s[0] + fmla v29.4s, v5.4s, v8.s[0] + fmla v30.4s, v6.4s, v8.s[0] + fmla v31.4s, v7.4s, v8.s[0] + + subs x3, x3, #1 + bge .f16f16_loop + + b .non_linear_loop + +.p2align 4 +.f32f16: + sub x3, x3, #1 +.f32f16_loop: + ld1 { v9.h }[0], [ x2 ], #2 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64 + + fcvtl v8.4s, v9.4h + + fmla v24.4s, v0.4s, v8.s[0] + fmla v25.4s, v1.4s, v8.s[0] + fmla v26.4s, v2.4s, v8.s[0] + fmla v27.4s, v3.4s, v8.s[0] + fmla v28.4s, v4.4s, v8.s[0] + fmla v29.4s, v5.4s, v8.s[0] + fmla v30.4s, v6.4s, v8.s[0] + fmla v31.4s, v7.4s, v8.s[0] + + subs x3, x3, #1 + bge .f32f16_loop + + b .non_linear_loop + +.p2align 4 +.f16f32: + sub x3, x3, #1 +.f16f32_loop: + ld1 { v8.s }[0], [ x2 ], #4 + ld1 { v10.8h-v13.8h }, [ x1 ], #64 + + {% for reg in (0..3) %} + fcvtl v{{reg|times:2}}.4s, v{{reg|plus:10}}.4h + fcvtl2 v{{reg|times:2|plus:1}}.4s, v{{reg|plus:10}}.8h + {% endfor %} + + fmla v24.4s, v0.4s, 
v8.s[0] + fmla v25.4s, v1.4s, v8.s[0] + fmla v26.4s, v2.4s, v8.s[0] + fmla v27.4s, v3.4s, v8.s[0] + fmla v28.4s, v4.4s, v8.s[0] + fmla v29.4s, v5.4s, v8.s[0] + fmla v30.4s, v6.4s, v8.s[0] + fmla v31.4s, v7.4s, v8.s[0] + + subs x3, x3, #1 + bge .f16f32_loop + + b .non_linear_loop + +{% include "arm64simd_mmm_f32_scalars.tmpliq" from:24, to:31%} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:32, from:24, to:31%} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:32, from:24, to:31%} +{% include "arm64simd_mmm_load_tile.tmpliq" from:24, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + cmp x6, #4 + beq .do_per_row_add + + {% for reg in (24..31) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x5 ], x6 + {% endfor %} + fadd v{{reg}}.4s, v{{reg}}.4s, v0.4s + {% endfor %} + + b .non_linear_loop + +.do_per_row_add: + ld1 {v0.4s-v3.4s}, [x5], #64 + ld1 {v4.4s-v7.4s}, [x5], #64 + + {% for r in (0..7) %} + fadd v{{r| plus: 24}}.4s, v{{r | plus: 24}}.4s, v{{r}}.4s + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x3, [x0, #16] + ldr x2, [x0, #8] + + ld1 {v8.s}[0], [ x3 ] + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ], #64 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [ x2 ], #64 + + fmla v24.4s, v0.4s, v8.s[0] + fmla v25.4s, v1.4s, v8.s[0] + fmla v26.4s, v2.4s, v8.s[0] + fmla v27.4s, v3.4s, v8.s[0] + fmla v28.4s, v4.4s, v8.s[0] + fmla v29.4s, v5.4s, v8.s[0] + fmla v30.4s, v6.4s, v8.s[0] + fmla v31.4s, v7.4s, v8.s[0] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x8, #2 + beq .store_f16 + + cmp x6, #4 + beq .store_strides_contig + + {% for reg in (24..31) %} + {% for lane in (0..3) %} + st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for reg in (24..31) %} + st1 { v{{reg}}.4s }, [ x5 ], #16 + {% endfor %} + b .non_linear_loop + +.store_f16: + {% for reg in (0..3) %} + fcvtn 
v{{reg}}.4h, v{{reg|times:2|plus:24}}.4s + fcvtn2 v{{reg}}.8h, v{{reg|times:2|plus:25}}.4s + {% endfor %} + + cmp x6, #2 + beq .store_strides_contig_f16 + + {% for reg in (0..3) %} + {% for lane in (0..7) %} + st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.store_strides_contig_f16: + + {% for reg in (0..3) %} + st1 { v{{reg}}.8h }, [ x5 ], #16 + {% endfor %} + b .non_linear_loop + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl new file mode 100644 index 000000000..e434b2b5e --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl @@ -0,0 +1,307 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, no need to preserve + +// no preservation either for v0-v7... +// v8..v15 are callee-preserved +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.global {{G}}arm64simd_mmm_f32_32x3_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_32x3_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + cmp x4, #1 + beq .f32f16 + cmp x4, #2 + beq .f16f32 + cmp x4, #3 + beq .f16f16 + +.p2align 4 +.packed_packed_loop_1: + ld1 { v7.4s }, [ x2 ] + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s, v6.4s }, [ x1 ], #48 + add x2, x2, #12 + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.4s, v0.4s, v7.s[{{ col }}] +{% endfor %} + + ld1 { v0.4s }, [ x1 ], #16 + +{% for row in (1..6) %} + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8|plus:row}}.4s, v{{row}}.4s, v7.s[{{col}}] + {% endfor %} +{% endfor %} + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:15}}.4s, v0.4s, v7.s[{{ col }}] +{% endfor %} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.p2align 4 +.f32f16: + ld1 { v7.4h }, [ x2 ] + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s, v6.4s }, [ x1 ], #48 + fcvtl v7.4s, v7.4h + add x2, x2, #6 + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.4s, v0.4s, v7.s[{{ col }}] +{% endfor %} + + ld1 { v0.4s }, [ x1 ], #16 + +{% for row in (1..6) %} + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8|plus:row}}.4s, v{{row}}.4s, v7.s[{{col}}] + {% endfor %} +{% endfor %} + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:15}}.4s, v0.4s, v7.s[{{ col }}] +{% endfor %} + + subs x3, x3, #1 + bne .f32f16 + + b .non_linear_loop + +.p2align 4 +.f16f32: + ld1 { v7.4s }, [ x2 ] + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64 + add x2, x2, #12 + + fcvtl v4.4s, v0.4h + fcvtl2 v5.4s, v0.8h + fcvtl v6.4s, v1.4h + fcvtl2 v0.4s, v1.8h + + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.4s, v4.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:9}}.4s, v5.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:10}}.4s, v6.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:11}}.4s, v0.4s, v7.s[{{col}}] + {% endfor %} + + fcvtl v4.4s, v2.4h + 
fcvtl2 v5.4s, v2.8h + fcvtl v6.4s, v3.4h + fcvtl2 v1.4s, v3.8h + + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:12}}.4s, v4.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:13}}.4s, v5.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:14}}.4s, v6.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:15}}.4s, v1.4s, v7.s[{{col}}] + {% endfor %} + + subs x3, x3, #1 + bne .f16f32 + + b .non_linear_loop + +.p2align 4 +.f16f16: + ld1 { v7.4h }, [ x2 ] + ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64 + add x2, x2, #6 + + fcvtl v7.4s, v7.4h + + fcvtl v4.4s, v0.4h + fcvtl2 v5.4s, v0.8h + fcvtl v6.4s, v1.4h + fcvtl2 v0.4s, v1.8h + + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.4s, v4.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:9}}.4s, v5.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:10}}.4s, v6.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:11}}.4s, v0.4s, v7.s[{{col}}] + {% endfor %} + + fcvtl v4.4s, v2.4h + fcvtl2 v5.4s, v2.8h + fcvtl v6.4s, v3.4h + fcvtl2 v1.4s, v3.8h + + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:12}}.4s, v4.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:13}}.4s, v5.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:14}}.4s, v6.4s, v7.s[{{col}}] + fmla v{{ col|times:8|plus:15}}.4s, v1.4s, v7.s[{{col}}] + {% endfor %} + + subs x3, x3, #1 + bne .f16f16 + + b .non_linear_loop + + +{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31%} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:32, from:8, to:31%} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:32, from:8, to:31%} +{% include "arm64simd_mmm_load_tile.tmpliq" from:8, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + {% for col in (0..2) %} + mov x4, x5 + {% for reg in (0..7) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x4 ], x6 + {% endfor %} + fadd v{{col | times:8 | plus: 8| plus: reg}}.4s, v{{col | times:8 | plus: 8 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldp 
x2, x3, [x0, #8] + + ld1 { v7.d }[0], [ x3 ], #8 + ld1 { v7.s }[2], [ x3 ], #4 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ], #64 + ld1 { v4.4s, v5.4s, v6.4s }, [ x2 ], #48 + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:8}}.4s, v0.4s, v7.s[{{ col }}] +{% endfor %} + + ld1 { v0.4s }, [ x2 ], #16 + +{% for row in (1..6) %} + {% for col in (0..2) %} + fmla v{{ col|times:8|plus:8|plus:row}}.4s, v{{row}}.4s, v7.s[{{col}}] + {% endfor %} +{% endfor %} + +{% for col in (0..2) %} + fmla v{{ col|times:8|plus:15}}.4s, v0.4s, v7.s[{{ col }}] +{% endfor %} + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x8, #2 + beq .store_f16 + + cmp x6, #4 + beq .store_strides_contig + + + {% for col in (0..2) %} + mov x4, x5 + {% for reg in (0..7) %} + {% for lane in (0..3) %} + st1 { v{{col | times:8 | plus: 8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for col in (0..2) %} + mov x4, x5 + {% for r in (0..7) %} + st1 { v{{col | times:8 | plus: 8 | plus: r}}.4s }, [ x4 ], 16 + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_f16: + + cmp x6, #2 + beq .store_strides_contig_f16 + + {% for col in (0..2) %} + {% for reg in (0..3) %} + fcvtn v{{reg}}.4h, v{{col|times:4|plus:reg|times:2|plus:8}}.4s + fcvtn2 v{{reg}}.8h, v{{col|times:4|plus:reg|times:2|plus:9}}.4s + {% endfor %} + + mov x4, x5 + {% for reg in (0..3) %} + {% for lane in (0..7) %} + st1 { v{{reg}}.h }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + + {% endfor %} + + + b .non_linear_loop + +.store_strides_contig_f16: + + {% for col in (0..2) %} + {% for reg in (0..3) %} + fcvtn v{{reg}}.4h, v{{col|times:4|plus:reg|times:2|plus:8}}.4s + fcvtn2 v{{reg}}.8h, v{{col|times:4|plus:reg|times:2|plus:9}}.4s + {% endfor %} + + mov x4, x5 + {% for reg in (0..3) %} + st1 { v{{reg}}.4s }, [ x4 ], #16 + 
{% endfor %} + add x5, x5, x7 + + {% endfor %} + b .non_linear_loop + + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli new file mode 100644 index 000000000..410816dff --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli @@ -0,0 +1,65 @@ + fmla v16.4s, v0.4s, v8.s[0] + ldr x5, [x1, #128] + fmla v17.4s, v1.4s, v8.s[0] + ldr x6, [x1, #136] + fmla v18.4s, v2.4s, v8.s[0] + ldr x7, [x1, #144] + fmla v19.4s, v3.4s, v8.s[0] + ldr x9, [x1, #152] + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64 + + fmla v20.4s, v4.4s, v8.s[0] + ldr x10, [x1, #96] + fmla v21.4s, v5.4s, v8.s[0] + ldr x11, [x1, #104] + fmla v22.4s, v6.4s, v8.s[0] + ldr x12, [x1, #112] + fmla v23.4s, v7.4s, v8.s[0] + ldr x13, [x1, #120] + + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [ x1 ] + + fmla v24.4s, v0.4s, v8.s[0] + ldr x14, [x1, #128] + fmla v25.4s, v1.4s, v8.s[0] + ldr x15, [x1, #136] + fmla v26.4s, v2.4s, v8.s[0] + ldr x20, [x1, #144] + fmla v27.4s, v3.4s, v8.s[0] + ldr x21, [x1, #152] + fmla v28.4s, v4.4s, v8.s[0] + ldr x22, [x1, #160] + fmla v29.4s, v5.4s, v8.s[0] + ldr x23, [x1, #168] + fmla v30.4s, v6.4s, v8.s[0] + ldr x24, [x1, #176] + fmla v31.4s, v7.4s, v8.s[0] + ldr x25, [x1, #184] + + ld1 {{ v8.s }}[0], [ x2 ], #4 + + prfm pldl1keep, [x1, #1024] + prfm pldl1keep, [x1, #1088] + prfm pldl1keep, [x1, #1152] + prfm pldl1keep, [x1, #1216] + prfm pldl1keep, [x2, #256] + + ins v0.d[0], x5 + ins v1.d[0], x7 + ins v2.d[0], x10 + ins v3.d[0], x12 + ins v4.d[0], x14 + ins v5.d[0], x20 + ins v6.d[0], x22 + ins v7.d[0], x24 + + ins v0.d[1], x6 + ins v1.d[1], x9 + ins v2.d[1], x11 + ins v3.d[1], x13 + ins 
v4.d[1], x15 + ins v5.d[1], x21 + ins v6.d[1], x23 + ins v7.d[1], x25 + + add x1, x1, #192 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli new file mode 100644 index 000000000..c65deb967 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli @@ -0,0 +1,32 @@ + ld1 {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64 + ld1 {{ v13.4s, v14.4s, v15.4s }}, [x1], #48 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + fmla v20.4s, v4.4s, v8.s[0] + fmla v21.4s, v5.4s, v8.s[0] + fmla v22.4s, v6.4s, v8.s[0] + fmla v23.4s, v7.4s, v8.s[0] + fmla v24.4s, v9.4s, v8.s[0] + ld1 {{ v9.4s }}, [ x1 ], #16 + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64 + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64 + fmla v25.4s, v10.4s, v8.s[0] + fmla v26.4s, v11.4s, v8.s[0] + fmla v27.4s, v12.4s, v8.s[0] + fmla v28.4s, v13.4s, v8.s[0] + fmla v29.4s, v14.4s, v8.s[0] + fmla v30.4s, v15.4s, v8.s[0] + + fmla v31.4s, v9.4s, v8.s[0] + + ld1 {{ v8.s }}[0], [ x2 ], #4 + + prfm pldl1keep, [x1, #1024] + prfm pldl1keep, [x1, #1088] + prfm pldl1keep, [x1, #1152] + prfm pldl1keep, [x1, #1216] + prfm pldl1keep, [x2, #256] + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli new file mode 100644 index 000000000..2a5f06603 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli @@ -0,0 +1,85 @@ + ld1 {{ v9.4s, v10.4s, v11.4s }}, [x1], #48 + + fmla v16.4s, v0.4s, v8.s[0] + ldr x8, [x2], #8 + fmla v17.4s, v1.4s, v8.s[0] + ldr d12, [x1], #8 + fmla v18.4s, v2.4s, v8.s[0] + ldr x12, [x1], #8 + fmla v19.4s, v3.4s, v8.s[0] + ldr d13, [x1], #8 + fmla v20.4s, v4.4s, v8.s[0] + ldr 
x13, [x1], #8 + fmla v21.4s, v5.4s, v8.s[0] + ldr d14, [x1], #8 + fmla v22.4s, v6.4s, v8.s[0] + ldr x14, [x1], #8 + fmla v23.4s, v7.4s, v8.s[0] + ldr d15, [x1], #8 + fmla v24.4s, v9.4s, v8.s[0] + ldr x15, [x1], #8 + + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64 + ins v8.d[1], x8 + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64 + + fmla v25.4s, v10.4s, v8.s[0] + ins v12.d[1], x12 + fmla v26.4s, v11.4s, v8.s[0] + ins v13.d[1], x13 + fmla v27.4s, v12.4s, v8.s[0] + ins v14.d[1], x14 + fmla v28.4s, v13.4s, v8.s[0] + ins v15.d[1], x15 + + ld1 {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64 + + fmla v29.4s, v14.4s, v8.s[0] + ldr d13, [x1], #8 + fmla v30.4s, v15.4s, v8.s[0] + ldr x13, [x1], #8 + fmla v31.4s, v0.4s, v8.s[0] + ldr d14, [x1], #8 + + fmla v16.4s, v1.4s, v8.s[2] + ldr x14, [x1], #8 + fmla v17.4s, v2.4s, v8.s[2] + ldr d15, [x1], #8 + fmla v18.4s, v3.4s, v8.s[2] + ldr x15, [x1], #8 + fmla v19.4s, v4.4s, v8.s[2] + + ld1 {{ v0.4s }}, [x1], #16 + + fmla v20.4s, v5.4s, v8.s[2] + ldr d1, [x1], #8 + fmla v21.4s, v6.4s, v8.s[2] + ldr x10, [x1], #8 + + fmla v22.4s, v7.4s, v8.s[2] + + fmla v23.4s, v9.4s, v8.s[2] + ins v13.d[1], x13 + fmla v24.4s, v10.4s, v8.s[2] + ins v14.d[1], x14 + fmla v25.4s, v11.4s, v8.s[2] + ins v15.d[1], x15 + + fmla v26.4s, v12.4s, v8.s[2] + prfm pldl1keep, [x1, #1024] + fmla v27.4s, v13.4s, v8.s[2] + ins v1.d[1], x10 + fmla v28.4s, v14.4s, v8.s[2] + prfm pldl1keep, [x1, #1088] + fmla v29.4s, v15.4s, v8.s[2] + prfm pldl1keep, [x1, #1152] + fmla v30.4s, v0.4s, v8.s[2] + prfm pldl1keep, [x1, #1216] + fmla v31.4s, v1.4s, v8.s[2] + prfm pldl1keep, [x2, #256] + + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64 + ins v8.s[0], v8.s[3] + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64 + + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli new file mode 100644 index 000000000..cba3dadc5 --- /dev/null +++ 
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli @@ -0,0 +1,66 @@ +// load a: v9, v10, v11, v12, v13, v14, v15 +// load a: v0, v1, v2, v3, v4, v5, v6, v7 + + ld1 {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64 + ld1 {{ v13.4s, v14.4s, v15.4s }}, [x1], #48 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + + ld1 {{ v0.4s, v1.4s }}, [x1], #32 + + fmla v20.4s, v4.4s, v8.s[0] + fmla v21.4s, v5.4s, v8.s[0] + + ld1 {{ v2.4s, v3.4s, v4.4s, v5.4s }}, [x1], #64 + fmla v22.4s, v6.4s, v8.s[0] + fmla v23.4s, v7.4s, v8.s[0] + + ld1 {{ v6.4s, v7.4s }}, [x1], #32 + + fmla v24.4s, v9.4s, v8.s[0] + fmla v25.4s, v10.4s, v8.s[0] + fmla v26.4s, v11.4s, v8.s[0] + fmla v27.4s, v12.4s, v8.s[0] + fmla v28.4s, v13.4s, v8.s[0] + fmla v29.4s, v14.4s, v8.s[0] + fmla v30.4s, v15.4s, v8.s[0] + + ld1 {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64 + ld1 {{ v13.4s, v14.4s, v15.4s }}, [x1], #48 + + fmla v31.4s, v0.4s, v8.s[0] + ld1 {{ v8.s }}[0], [ x2 ], #4 + + fmla v16.4s, v1.4s, v8.s[0] + ld1 {{ v0.4s, v1.4s }}, [x1], #32 + fmla v17.4s, v2.4s, v8.s[0] + fmla v18.4s, v3.4s, v8.s[0] + fmla v19.4s, v4.4s, v8.s[0] + + fmla v20.4s, v5.4s, v8.s[0] + fmla v21.4s, v6.4s, v8.s[0] + fmla v22.4s, v7.4s, v8.s[0] + fmla v23.4s, v9.4s, v8.s[0] + + fmla v24.4s, v10.4s, v8.s[0] + fmla v25.4s, v11.4s, v8.s[0] + fmla v26.4s, v12.4s, v8.s[0] + fmla v27.4s, v13.4s, v8.s[0] + fmla v28.4s, v14.4s, v8.s[0] + fmla v29.4s, v15.4s, v8.s[0] + fmla v30.4s, v0.4s, v8.s[0] + fmla v31.4s, v1.4s, v8.s[0] + ld1 {{ v8.s }}[0], [ x2 ], #4 + + ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64 + ld1 {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64 + + prfm pldl1keep, [x1, #1024] + prfm pldl1keep, [x1, #1088] + prfm pldl1keep, [x1, #1152] + prfm pldl1keep, [x1, #1216] + prfm pldl1keep, [x2, #256] + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl new file mode 100644 index 000000000..1f0c6ce41 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl @@ -0,0 +1,225 @@ +// vim: ft=arm + +// C tile regs: +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve +// +// The tile is a single column of 64 rows held in v16..v31, +// four consecutive rows per register: +// v16[0..3] rows 0..3 +// v17[0..3] rows 4..7 +// +// ... +// v31[0..3] rows 60..63 +// (.store writes v16..v31 lane by lane, advancing by the row stride; +// when that stride is 4 bytes it stores the registers back to back.) + +// packed A: .add_mat_mul streams it through v0-v7 and v9-v15 +// packed B: one f32 per k step, kept in v8 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_f32_64x1_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_64x1_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]!
+ +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + sub x3, x3, #1 + + + ld1 { v8.s }[0], [ x2 ], #4 + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64 + + cmp x3, #0 + beq .packed_packed_loop_1_last + + cmp x3, #4 + blt .packed_packed_loop_1 + +{% capture packed_packed_loop1 %} + {% if core == "a53" %} + {% include "arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli" %} + {% else %} + {% include "arm64simd_mmm_f32_64x1/loop1/naive.tmpli" %} + {% endif %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% if core == "a53" %} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {% elsif core == "a55" %} + {% include "arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli" %} + {% else %} + {% include "arm64simd_mmm_f32_64x1/loop2/naive.tmpli" %} + {% endif %} +{% endcapture %} + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + cmp x3, #0 + beq .packed_packed_loop_1_last + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + +// last loop can't read beyond actual input as it's likely not packed and padded +.packed_packed_loop_1_last: + ld1 { v9.4s, v10.4s, v11.4s, v12.4s }, [x1], #64 + ld1 { v13.4s, v14.4s, v15.4s }, [x1], #48 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + ld1 { v0.4s }, [ x1 ] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + fmla v20.4s, v4.4s, v8.s[0] + fmla v21.4s, v5.4s, v8.s[0] + fmla v22.4s, v6.4s, v8.s[0] + fmla v23.4s, v7.4s, v8.s[0] + + fmla v24.4s, v9.4s, v8.s[0] + fmla v25.4s, v10.4s, v8.s[0] + fmla v26.4s, v11.4s, v8.s[0] + fmla v27.4s, v12.4s, v8.s[0] + fmla v28.4s, v13.4s, v8.s[0] + fmla v29.4s, v14.4s, v8.s[0] + fmla v30.4s, v15.4s, v8.s[0] + fmla v31.4s, v0.4s, v8.s[0] + + b .non_linear_loop + 
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:64, from:16, to:31%} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:64, from:16, to:31%} +{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] // c base ptr, rsc + cmp x6, #4 + beq .do_per_row_add + + {% for reg in (16..31) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x5 ], x6 + {% endfor %} + fadd v{{reg}}.4s, v{{reg}}.4s, v0.4s + {% endfor %} + + b .non_linear_loop + +.do_per_row_add: + ld1 {v0.4s-v3.4s}, [x5], #64 + ld1 {v4.4s-v7.4s}, [x5], #64 + ld1 {v8.4s-v11.4s}, [x5], #64 + ld1 {v12.4s-v15.4s}, [x5], #64 + + {% for r in (0..15) %} + fadd v{{r| plus: 16}}.4s, v{{r | plus: 16}}.4s, v{{r}}.4s + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x3, [x0, #16] // -> the single f32 loaded into v8.s[0] below + ldr x2, [x0, #8] // -> 64 consecutive f32, read 16 bytes at a time into q0-q7 (two passes) + + ld1 {v8.s}[0], [ x3 ] + + {% for r in (0..7) %} + ldr q{{r}}, [x2], #16 + {% endfor %} + + fmla v16.4s, v0.4s, v8.s[0] + ldr q0, [x2], #16 + fmla v17.4s, v1.4s, v8.s[0] + ldr q1, [x2], #16 + fmla v18.4s, v2.4s, v8.s[0] + ldr q2, [x2], #16 + fmla v19.4s, v3.4s, v8.s[0] + ldr q3, [x2], #16 + fmla v20.4s, v4.4s, v8.s[0] + ldr q4, [x2], #16 + fmla v21.4s, v5.4s, v8.s[0] + ldr q5, [x2], #16 + fmla v22.4s, v6.4s, v8.s[0] + ldr q6, [x2], #16 + fmla v23.4s, v7.4s, v8.s[0] + ldr q7, [x2], #16 + + fmla v24.4s, v0.4s, v8.s[0] + fmla v25.4s, v1.4s, v8.s[0] + fmla v26.4s, v2.4s, v8.s[0] + fmla v27.4s, v3.4s, v8.s[0] + fmla v28.4s, v4.4s, v8.s[0] + fmla v29.4s, v5.4s, v8.s[0] + fmla v30.4s, v6.4s, v8.s[0] + fmla v31.4s, v7.4s, v8.s[0] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + + cmp x6, #4 + beq .store_strides_contig + + {% for reg in (16..31) %} + {% for lane in (0..3) %} + st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + b .non_linear_loop + +.store_strides_contig: + + {% for reg in (16..31) %} + st1 { v{{reg}}.4s }, [ x5 ], #16 + {% endfor %} +
b .non_linear_loop + +.return: + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli new file mode 100644 index 000000000..9b3035b21 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli @@ -0,0 +1,25 @@ +ld1 {{ v2.4s, v3.4s }}, [x1], #32 +ld1 {{ v6.4s, v7.4s }}, [x2], #32 + +fmla v16.4s, v0.4s, v4.s[0] +fmla v17.4s, v1.4s, v4.s[0] +fmla v18.4s, v0.4s, v4.s[1] +fmla v19.4s, v1.4s, v4.s[1] +fmla v20.4s, v0.4s, v4.s[2] +fmla v21.4s, v1.4s, v4.s[2] +fmla v22.4s, v0.4s, v4.s[3] +fmla v23.4s, v1.4s, v4.s[3] + +fmla v24.4s, v0.4s, v5.s[0] +fmla v25.4s, v1.4s, v5.s[0] +fmla v26.4s, v0.4s, v5.s[1] +fmla v27.4s, v1.4s, v5.s[1] +fmla v28.4s, v0.4s, v5.s[2] +fmla v29.4s, v1.4s, v5.s[2] +fmla v30.4s, v0.4s, v5.s[3] +fmla v31.4s, v1.4s, v5.s[3] + +and v0.16b, v2.16b, v2.16b +and v1.16b, v3.16b, v3.16b +and v4.16b, v6.16b, v6.16b +and v5.16b, v7.16b, v7.16b diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli new file mode 100644 index 000000000..ac5bdc5bb --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli @@ -0,0 +1,51 @@ + +fmla v16.4s, v0.4s, v4.s[0] +ldr w5, [x1], #4 +fmla v17.4s, v1.4s, v4.s[0] +ldr w20, [x2], #4 +fmla v18.4s, v0.4s, v4.s[1] +ldr w6, [x1], #4 +fmla v19.4s, v1.4s, v4.s[1] // v1 * b[1] accumulates into v19, the partner of v18 +ldr w21, [x2], #4 +fmla v20.4s, v0.4s, v4.s[2] +ldr w7, [x1], #4 +fmla v21.4s,
v1.4s, v4.s[2] +ldr w22, [x2], #4 +fmla v22.4s, v0.4s, v4.s[3] +ldr w8, [x1], #4 +fmla v23.4s, v1.4s, v4.s[3] +ldr w23, [x2], #4 + +fmla v24.4s, v0.4s, v5.s[0] +ldr w9, [x1], #4 +fmla v25.4s, v1.4s, v5.s[0] +ldr w24, [x2], #4 +fmla v26.4s, v0.4s, v5.s[1] +ldr w10, [x1], #4 +fmla v27.4s, v1.4s, v5.s[1] +ldr w25, [x2], #4 +fmla v28.4s, v0.4s, v5.s[2] +ldr w11, [x1], #4 +fmla v29.4s, v1.4s, v5.s[2] +ldr w26, [x2], #4 +fmla v30.4s, v0.4s, v5.s[3] +ldr w12, [x1], #4 +fmla v31.4s, v1.4s, v5.s[3] +ldr w27, [x2], #4 + +ins v0.s[0], w5 +ins v4.s[0], w20 +ins v1.s[0], w9 +ins v5.s[0], w24 +ins v0.s[2], w7 +ins v4.s[2], w22 +ins v1.s[2], w11 +ins v5.s[2], w26 +ins v0.s[1], w6 +ins v4.s[1], w21 +ins v1.s[1], w10 +ins v5.s[1], w25 +ins v0.s[3], w8 +ins v4.s[3], w23 +ins v1.s[3], w12 +ins v5.s[3], w27 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli new file mode 100644 index 000000000..3afc78c7b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli @@ -0,0 +1,54 @@ +fmla v16.4s, v0.4s, v4.s[0] +ldr w5, [x1], #4 +fmla v17.4s, v1.4s, v4.s[0] +ldr w20, [x2], #4 +fmla v18.4s, v0.4s, v4.s[1] +ldr w6, [x1], #4 +fmla v19.4s, v1.4s, v4.s[1] +ldr w21, [x2], #4 +fmla v20.4s, v0.4s, v4.s[2] +ldr w7, [x1], #4 +fmla v21.4s, v1.4s, v4.s[2] +ldr w22, [x2], #4 +fmla v22.4s, v0.4s, v4.s[3] +ldr w8, [x1], #4 +fmla v23.4s, v1.4s, v4.s[3] +ldr w23, [x2], #4 + +fmla v24.4s, v0.4s, v5.s[0] +ldr w9, [x1], #4 +fmla v25.4s, v1.4s, v5.s[0] +ldr w24, [x2], #4 +fmla v26.4s, v0.4s, v5.s[1] +ldr w10, [x1], #4 +fmla v27.4s, v1.4s, v5.s[1] +ldr w25, [x2], #4 +fmla v28.4s, v0.4s, v5.s[2] +ldr w11, [x1], #4 +fmla v29.4s, v1.4s, v5.s[2] +ldr w26, [x2], #4 +fmla v30.4s, v0.4s, v5.s[3] +ldr w12, [x1], #4 +fmla v31.4s, v1.4s, v5.s[3] +ldr w27, [x2], #4 + +prfm pldl1keep, 
[x1, #256] +prfm pldl1keep, [x2, #256] + +ins v0.s[0], w5 +ins v4.s[0], w20 +ins v1.s[0], w9 +ins v5.s[0], w24 +ins v0.s[2], w7 +ins v4.s[2], w22 +ins v1.s[2], w11 +ins v5.s[2], w26 +ins v0.s[1], w6 +ins v4.s[1], w21 +ins v1.s[1], w10 +ins v5.s[1], w25 +ins v0.s[3], w8 +ins v4.s[3], w23 +ins v1.s[3], w12 +ins v5.s[3], w27 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli new file mode 100644 index 000000000..e3822d347 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli @@ -0,0 +1,35 @@ + +fmla v16.4s, v0.4s, v4.s[0] +ldr x5, [x1], #8 +fmla v17.4s, v1.4s, v4.s[0] +ldr x9, [x2], #8 +fmla v18.4s, v0.4s, v4.s[1] +ldr x6, [x1], #8 +fmla v19.4s, v1.4s, v4.s[1] +ldr x10, [x2], #8 +fmla v20.4s, v0.4s, v4.s[2] +ldr x7, [x1], #8 +fmla v21.4s, v1.4s, v4.s[2] +ldr x11, [x2], #8 +fmla v22.4s, v0.4s, v4.s[3] +ldr x8, [x1], #8 +fmla v23.4s, v1.4s, v4.s[3] +ldr x12, [x2], #8 + +fmla v24.4s, v0.4s, v5.s[0] +fmla v25.4s, v1.4s, v5.s[0] +fmla v26.4s, v0.4s, v5.s[1] +fmla v27.4s, v1.4s, v5.s[1] +fmla v28.4s, v0.4s, v5.s[2] +fmla v29.4s, v1.4s, v5.s[2] +fmla v30.4s, v0.4s, v5.s[3] +fmla v31.4s, v1.4s, v5.s[3] + +ins v2.d[0], x5 // NOTE(review): these ins fill v2/v3/v6/v7, but the fmla block above reads v0/v1/v4/v5 (ldr_x_preload.tmpli fills v0/v1/v4/v5 directly) - confirm the destinations before using this fragment as a loop body +ins v6.d[0], x9 +ins v3.d[0], x7 +ins v7.d[0], x11 +ins v2.d[1], x6 +ins v6.d[1], x10 +ins v3.d[1], x8 +ins v7.d[1], x12 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli new file mode 100644 index 000000000..11081e84f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli @@ -0,0 +1,43 @@ + +fmla v16.4s, v0.4s, v4.s[0] +ldr x5, [x1], #8 +fmla v17.4s, v1.4s, v4.s[0]
+ldr x9, [x2], #8 +fmla v18.4s, v0.4s, v4.s[1] +ldr x6, [x1], #8 +fmla v19.4s, v1.4s, v4.s[1] +ldr x10, [x2], #8 +fmla v20.4s, v0.4s, v4.s[2] +ldr x7, [x1], #8 +fmla v21.4s, v1.4s, v4.s[2] +ldr x11, [x2], #8 +fmla v22.4s, v0.4s, v4.s[3] +ldr x8, [x1], #8 +fmla v23.4s, v1.4s, v4.s[3] +ldr x12, [x2], #8 + +fmla v24.4s, v0.4s, v5.s[0] +prfm pldl1keep, [x1, #256] +fmla v25.4s, v1.4s, v5.s[0] +prfm pldl1keep, [x1, #320] +fmla v26.4s, v0.4s, v5.s[1] +prfm pldl1keep, [x1, #384] +fmla v27.4s, v1.4s, v5.s[1] +prfm pldl1keep, [x1, #448] +fmla v28.4s, v0.4s, v5.s[2] +prfm pldl1keep, [x2, #256] +fmla v29.4s, v1.4s, v5.s[2] +prfm pldl1keep, [x2, #320] +fmla v30.4s, v0.4s, v5.s[3] +prfm pldl1keep, [x2, #384] +fmla v31.4s, v1.4s, v5.s[3] +prfm pldl1keep, [x2, #448] + +ins v0.d[0], x5 +ins v4.d[0], x9 +ins v1.d[0], x7 +ins v5.d[0], x11 +ins v0.d[1], x6 +ins v4.d[1], x10 +ins v1.d[1], x8 +ins v5.d[1], x12 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli new file mode 100644 index 000000000..14abb2a87 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli @@ -0,0 +1,21 @@ + +fmla v16.4s, v0.4s, v4.s[0] +fmla v17.4s, v1.4s, v4.s[0] +fmla v18.4s, v0.4s, v4.s[1] +fmla v19.4s, v1.4s, v4.s[1] +fmla v20.4s, v0.4s, v4.s[2] +fmla v21.4s, v1.4s, v4.s[2] +fmla v22.4s, v0.4s, v4.s[3] +fmla v23.4s, v1.4s, v4.s[3] + +fmla v24.4s, v0.4s, v5.s[0] +fmla v25.4s, v1.4s, v5.s[0] +fmla v26.4s, v0.4s, v5.s[1] +fmla v27.4s, v1.4s, v5.s[1] +fmla v28.4s, v0.4s, v5.s[2] +fmla v29.4s, v1.4s, v5.s[2] +fmla v30.4s, v0.4s, v5.s[3] +fmla v31.4s, v1.4s, v5.s[3] + +ld1 {{ v0.4s, v1.4s }}, [x1], #32 +ld1 {{ v4.4s, v5.4s }}, [x2], #32 diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli 
b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli new file mode 100644 index 000000000..5235ac6c2 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli @@ -0,0 +1,41 @@ +ld1 {{ v2.4s, v3.4s }}, [x1], #32 +ld1 {{ v6.4s, v7.4s }}, [x2], #32 + +fmla v16.4s, v0.4s, v4.s[0] +fmla v17.4s, v1.4s, v4.s[0] +fmla v18.4s, v0.4s, v4.s[1] +fmla v19.4s, v1.4s, v4.s[1] +fmla v20.4s, v0.4s, v4.s[2] +fmla v21.4s, v1.4s, v4.s[2] +fmla v22.4s, v0.4s, v4.s[3] +fmla v23.4s, v1.4s, v4.s[3] + +fmla v24.4s, v0.4s, v5.s[0] +fmla v25.4s, v1.4s, v5.s[0] +fmla v26.4s, v0.4s, v5.s[1] +fmla v27.4s, v1.4s, v5.s[1] +fmla v28.4s, v0.4s, v5.s[2] +fmla v29.4s, v1.4s, v5.s[2] +fmla v30.4s, v0.4s, v5.s[3] +fmla v31.4s, v1.4s, v5.s[3] + +ld1 {{ v0.4s, v1.4s }}, [x1], #32 +ld1 {{ v4.4s, v5.4s }}, [x2], #32 + +fmla v16.4s, v2.4s, v6.s[0] +fmla v17.4s, v3.4s, v6.s[0] +fmla v18.4s, v2.4s, v6.s[1] +fmla v19.4s, v3.4s, v6.s[1] +fmla v20.4s, v2.4s, v6.s[2] +fmla v21.4s, v3.4s, v6.s[2] +fmla v22.4s, v2.4s, v6.s[3] +fmla v23.4s, v3.4s, v6.s[3] + +fmla v24.4s, v2.4s, v7.s[0] +fmla v25.4s, v3.4s, v7.s[0] +fmla v26.4s, v2.4s, v7.s[1] +fmla v27.4s, v3.4s, v7.s[1] +fmla v28.4s, v2.4s, v7.s[2] +fmla v29.4s, v3.4s, v7.s[2] +fmla v30.4s, v2.4s, v7.s[3] +fmla v31.4s, v3.4s, v7.s[3] diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli new file mode 100644 index 000000000..7f8759688 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli @@ -0,0 +1,60 @@ +fmla v16.4s, v0.4s, v4.s[0] +ldr d2, [x1], #8 +fmla v17.4s, v1.4s, v4.s[0] +ldr d6, [x2], #8 +fmla v18.4s, v0.4s, v4.s[1] +ldr x5, [x1], #8 +fmla v19.4s, v1.4s, v4.s[1] +ldr x7, [x2], #8 +fmla v20.4s, v0.4s, v4.s[2] 
+ldr d3, [x1], #8 +fmla v21.4s, v1.4s, v4.s[2] +ldr d7, [x2], #8 +fmla v22.4s, v0.4s, v4.s[3] +ldr x6, [x1], #8 +fmla v23.4s, v1.4s, v4.s[3] +ldr x8, [x2], #8 + +fmla v24.4s, v0.4s, v5.s[0] +fmla v25.4s, v1.4s, v5.s[0] +fmla v26.4s, v0.4s, v5.s[1] +fmla v27.4s, v1.4s, v5.s[1] +fmla v28.4s, v0.4s, v5.s[2] +ins v2.d[1], x5 +fmla v29.4s, v1.4s, v5.s[2] +ins v6.d[1], x7 +fmla v30.4s, v0.4s, v5.s[3] +ins v3.d[1], x6 +fmla v31.4s, v1.4s, v5.s[3] +ins v7.d[1], x8 + +fmla v16.4s, v2.4s, v6.s[0] +ldr d0, [x1], #8 +fmla v17.4s, v3.4s, v6.s[0] +ldr d4, [x2], #8 +fmla v18.4s, v2.4s, v6.s[1] +ldr x5, [x1], #8 +fmla v19.4s, v3.4s, v6.s[1] +ldr x7, [x2], #8 +fmla v20.4s, v2.4s, v6.s[2] +ldr d1, [x1], #8 +fmla v21.4s, v3.4s, v6.s[2] +ldr d5, [x2], #8 +fmla v22.4s, v2.4s, v6.s[3] +ldr x6, [x1], #8 +fmla v23.4s, v3.4s, v6.s[3] +ldr x8, [x2], #8 + +fmla v24.4s, v2.4s, v7.s[0] +fmla v25.4s, v3.4s, v7.s[0] +fmla v26.4s, v2.4s, v7.s[1] +fmla v27.4s, v3.4s, v7.s[1] +fmla v28.4s, v2.4s, v7.s[2] +ins v0.d[1], x5 +fmla v29.4s, v3.4s, v7.s[2] +ins v4.d[1], x7 +fmla v30.4s, v2.4s, v7.s[3] +ins v1.d[1], x6 +fmla v31.4s, v3.4s, v7.s[3] +ins v5.d[1], x8 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl new file mode 100644 index 000000000..8bef26cc5 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl @@ -0,0 +1,182 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, (scratch) +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve +// +// v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0] +// v16[1] v18[1] +// v16[2] v18[2] +// v16[3] v18[3] +// +// v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0] +// v17[1] v19[1] +// v17[2] v19[2] +// v17[3] v19[3] + +// v0-v7 (scratch registers) +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B 
buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_f32_8x8_{{core}}_{{suffix}} +{{G}}arm64simd_mmm_f32_8x8_{{core}}_{{suffix}}: + + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldr x2, [x0, #24] // b + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + +.packed_packed: + ld1 { v0.4s, v1.4s }, [ x1 ], #32 + ld1 { v4.4s, v5.4s }, [ x2 ], #32 + +{% capture packed_packed_loop1 %} + {% if core == "a53" %} + {% include "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli" %} + {% else %} + {% include "arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli" %} + {% endif %} +{% endcapture %} + +{% capture packed_packed_loop2 %} + {% if core == "a55" %} + {% include "arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli" %} + {% else %} + {{ packed_packed_loop1 }} + {{ packed_packed_loop1 }} + {% endif %} +{% endcapture %} + + cmp x3, #4 + blt .packed_packed_loop_1 + +.p2align 4 +.packed_packed_loop_4: + {{ packed_packed_loop2 }} + {{ packed_packed_loop2 }} + + sub x3, x3, #4 + cmp x3, #4 + bge .packed_packed_loop_4 + + + cmp x3, #0 + beq .non_linear_loop + +.p2align 4 +.packed_packed_loop_1: + {{ packed_packed_loop1 }} + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%} +{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:8, from:16, to:31 %} +{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:8, from:16, to:31 %} +{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x4 
], x6 + {% endfor %} + fadd v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.4s, v1.4s }, [ x2 ], #32 + ld1 { v4.4s, v5.4s }, [ x3 ], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] + + b .non_linear_loop + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x6, #4 + bne .store_strides_generic + + {% for col in (8..15) %} + str q{{col | times:2 }}, [ x5 ] + str q{{col | times:2 | plus: 1}}, [ x5, #16 ] + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_generic: + + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..3) %} + st1 { v{{col | times:2 | plus: reg}}.s }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq new file mode 100644 index 000000000..ce1ffe1f1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq @@ -0,0 +1,9 @@ +// vim: ft=arm + +{% include 
"arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_min", op:"fmin", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_max", op:"fmax", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_mul", op:"fmul", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_add", op:"fadd", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub", op:"fsub", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%} + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq new file mode 100644 index 000000000..c518a6b4e --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq @@ -0,0 +1,9 @@ +// vim: ft=arm + +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_min", op:"fmin", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_max", op:"fmax", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_mul", op:"fmul", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_add", op:"fadd", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub", op:"fsub", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%} + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq new file mode 100644 index 000000000..cc053df02 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq @@ -0,0 +1,36 @@ +// vim: ft=arm + +{% 
include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_min", op:"fmin", from:from, to:to %} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_max", op:"fmax", from:from, to:to %} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_mul", op:"fmul", from:from, to:to %} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_add", op:"fadd", from:from, to:to %} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub", op:"fsub", from:from, to:to %} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub_flipped", op:"fsub", from:from, to:to, flipped:true %} + + +.clear: +{% for r in (from..to) %} + eor v{{r}}.8b, v{{r}}.8b, v{{r}}.8b +{% endfor %} + b .non_linear_loop + +.leaky_relu: + add x2, x0, #8 + ld1 {v4.s}[0], [ x2 ] + dup v4.4s, v4.s[0] + + // bsl cond/dst, then, else + // fcmge dst, src, #0.0 + {% for r in (from..to) %} + fmul v0.4s, v{{r}}.4s, v4.4s + fcmge v1.4s, v{{r}}.4s, #0.0 + bsl v1.16b, v{{r}}.16b, v0.16b + and v{{r}}.16b, v1.16b, v1.16b + {% endfor %} + + b .non_linear_loop + +.q_scale: +.q_shl: +.q_shr: + b .unsupported diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl new file mode 100644 index 000000000..45192c8a5 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl @@ -0,0 +1,180 @@ +// vim: ft=arm + +// C tile regs: +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve + +// no preservation either for v0-v7... +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_i32_64x1_{{suffix}} +{{G}}arm64simd_mmm_i32_64x1_{{suffix}}: + +/* + prfm pldl1keep, [x1] + prfm pldl1keep, [x2] +*/ + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! 
+ stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + cmp x4, #1 + beq .packed_packed_loop_1_i8i8 + +.packed_packed_loop_1: + ld1 {v9.s}[0], [ x2 ], 4 + + ld1 { v0.4s-v3.4s }, [ x1 ], #64 + ld1 { v4.4s-v7.4s }, [ x1 ], #64 + {% for reg in (0..3) %} + mla v{{reg | times: 2 | plus: 16 }}.4s, v{{reg | times:2}}.4s, v9.s[0] + mla v{{reg | times: 2 | plus: 17 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0] + {% endfor %} + + ld1 { v0.4s-v3.4s }, [ x1 ], #64 + ld1 { v4.4s-v7.4s }, [ x1 ], #64 + {% for reg in (0..3) %} + mla v{{reg | times: 2 | plus: 24 }}.4s, v{{reg | times:2}}.4s, v9.s[0] + mla v{{reg | times: 2 | plus: 25 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0] + {% endfor %} + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.packed_packed_loop_1_i8i8: + ld1 {v9.b}[0], [ x2 ], 1 + sshll v9.8h, v9.8b, 0 + + ld1 { v0.8b-v3.8b }, [ x1 ], #32 + ld1 { v4.8b-v7.8b }, [ x1 ], #32 + + {% for reg in (0..7) %} + sshll v10.8h, v{{reg}}.8b, 0 + smlal v{{reg | times: 2 | plus: 16 }}.4s, v10.4h, v9.h[0] + smlal2 v{{reg | times: 2 | plus: 17 }}.4s, v10.8h, v9.h[0] + {% endfor %} + + subs x3, x3, #1 + bne .packed_packed_loop_1_i8i8 + + b .non_linear_loop + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + cmp x8, #4 + beq non_linear_addc_i32 + + {% for reg in (16..31) %} + {% for lane in (0..3) %} + ld1 {v0.b}[{{lane}}], [ x5 ], x6 + {% endfor %} + sshll v0.8h, v0.8b, 0 + sshll v0.4s, v0.4h, 0 + add v{{reg}}.4s, v{{reg}}.4s, v0.4s + {% endfor %} + + b .non_linear_loop + +non_linear_addc_i32: + {% for reg in (16..31) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x5 ], x6 + {% endfor %} + add v{{reg}}.4s, v{{reg}}.4s, v0.4s + {% endfor %} + + b 
.non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v15.s }[0], [ x3 ] + xtn v15.4h, v15.4s + + ld1 { v0.4s-v3.4s }, [ x2 ], #64 + ld1 { v4.4s-v7.4s }, [ x2 ], #64 + + {% for reg in (0..7) %} + xtn v{{reg}}.4h, v{{reg}}.4s + smlal v{{reg|plus: 16}}.4s, v{{reg}}.4h, v15.h[0] + {% endfor %} + + ld1 { v0.4s-v3.4s }, [ x2 ], #64 + ld1 { v4.4s-v7.4s }, [ x2 ], #64 + + {% for reg in (0..7) %} + xtn v{{reg}}.4h, v{{reg}}.4s + smlal v{{reg|plus: 24}}.4s, v{{reg}}.4h, v15.h[0] + {% endfor %} + + b .non_linear_loop + +{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31 %} +{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:64, from:16, to:31 %} +{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:64, from:16, to:31 %} +{% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %} +{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %} + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x8, #4 + beq .store_strides_i32 + + {% for reg in (16..31) %} + {% for lane in (0..3) %} + st1 { v{{reg}}.b }[{{lane | times: 4}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.store_strides_i32: + {% for reg in (16..31) %} + {% for lane in (0..3) %} + st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6 + {% endfor %} + {% endfor %} + + b .non_linear_loop + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl new file mode 100644 index 000000000..5aae3dc91 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl @@ -0,0 +1,234 @@ +// vim: ft=arm + +// C tile regs: +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - 
d8..d15 to preserve +// - v16 to v31, no need to preserve +// +// v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0] +// v16[1] v18[1] +// v16[2] v18[2] +// v16[3] v18[3] +// +// v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0] +// v17[1] v19[1] +// v17[2] v19[2] +// v17[3] v19[3] + +// no preservation either for v0-v7... +// packed A buffering (2x8 values): alternating v0, v1 with v2, v3 +// packed B buffering (2x8 values): alternating v4, v5 with v6, v7 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_mmm_i32_8x8_{{suffix}} +{{G}}arm64simd_mmm_i32_8x8_{{suffix}}: + +/* + prfm pldl1keep, [x1] + prfm pldl1keep, [x2] +*/ + stp x20, x21, [sp, #-16]! + stp x22, x23, [sp, #-16]! + stp x24, x25, [sp, #-16]! + stp x26, x27, [sp, #-16]! + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + +{% include "dispatcher.tmpliq" %} + +.add_mat_mul: + ldp x2, x4, [x0, #24] // b, packing + ldp x3, x1, [x0, #8] // k, a + + cmp x3, #0 + beq .non_linear_loop + + cmp x4, #1 + beq .packed_packed_loop_1_i8i8 + +.packed_packed_loop_1: + + ld1 { v0.4s, v1.4s }, [ x1 ], #32 + ld1 { v4.4s, v5.4s }, [ x2 ], #32 + + mla v16.4s, v0.4s, v4.s[0] + mla v17.4s, v1.4s, v4.s[0] + mla v18.4s, v0.4s, v4.s[1] + mla v19.4s, v1.4s, v4.s[1] + + mla v20.4s, v0.4s, v4.s[2] + mla v21.4s, v1.4s, v4.s[2] + mla v22.4s, v0.4s, v4.s[3] + mla v23.4s, v1.4s, v4.s[3] + + mla v24.4s, v0.4s, v5.s[0] + mla v25.4s, v1.4s, v5.s[0] + mla v26.4s, v0.4s, v5.s[1] + mla v27.4s, v1.4s, v5.s[1] + + mla v28.4s, v0.4s, v5.s[2] + mla v29.4s, v1.4s, v5.s[2] + mla v30.4s, v0.4s, v5.s[3] + mla v31.4s, v1.4s, v5.s[3] + + subs x3, x3, #1 + bne .packed_packed_loop_1 + + b .non_linear_loop + +.packed_packed_loop_1_i8i8: + + ld1 { v0.8b }, [ x1 ], #8 + sshll v0.8h, v0.8b, 0 + ld1 { v4.8b }, [ x2 ], #8 + sshll v4.8h, v4.8b, 0 + + smlal v16.4s, v0.4h, v4.h[0] + smlal2 v17.4s, v0.8h, v4.h[0] + smlal v18.4s, v0.4h, v4.h[1] + smlal2 v19.4s, v0.8h, v4.h[1] 
+ smlal v20.4s, v0.4h, v4.h[2] + smlal2 v21.4s, v0.8h, v4.h[2] + smlal v22.4s, v0.4h, v4.h[3] + smlal2 v23.4s, v0.8h, v4.h[3] + + smlal v24.4s, v0.4h, v4.h[4] + smlal2 v25.4s, v0.8h, v4.h[4] + smlal v26.4s, v0.4h, v4.h[5] + smlal2 v27.4s, v0.8h, v4.h[5] + smlal v28.4s, v0.4h, v4.h[6] + smlal2 v29.4s, v0.8h, v4.h[6] + smlal v30.4s, v0.4h, v4.h[7] + smlal2 v31.4s, v0.8h, v4.h[7] + + subs x3, x3, #1 + bne .packed_packed_loop_1_i8i8 + + b .non_linear_loop + +{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31%} +{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:8, from:16, to:31%} +{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:8, from:16, to:31%} +{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %} + +.add_unicast: + ldp x5, x6, [x0, #8] + ldp x7, x8, [x0, #24] + + cmp x8, #4 + beq non_linear_addc_i32 + + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..3) %} + ld1 {v0.b}[{{lane}}], [ x4 ], x6 + {% endfor %} + sshll v0.8h, v0.8b, 0 + sshll v0.4s, v0.4h, 0 + add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +non_linear_addc_i32: + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..3) %} + ld1 {v0.s}[{{lane}}], [ x4 ], x6 + {% endfor %} + add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.add_row_col_products: + ldr x2, [x0, #8] + ldr x3, [x0, #16] + + ld1 { v0.4s, v1.4s }, [ x2 ] + ld1 { v4.4s, v5.4s }, [ x3 ] + + xtn v0.4h, v0.4s + xtn v1.4h, v1.4s + xtn v4.4h, v4.4s + xtn v5.4h, v5.4s + + smlal v16.4s, v0.4h, v4.h[0] + smlal v17.4s, v1.4h, v4.h[0] + smlal v18.4s, v0.4h, v4.h[1] + smlal v19.4s, v1.4h, v4.h[1] + smlal v20.4s, v0.4h, v4.h[2] + smlal v21.4s, v1.4h, v4.h[2] + smlal v22.4s, v0.4h, v4.h[3] + smlal v23.4s, v1.4h, v4.h[3] + + smlal v24.4s, v0.4h, v5.h[0] + smlal 
v25.4s, v1.4h, v5.h[0] + smlal v26.4s, v0.4h, v5.h[1] + smlal v27.4s, v1.4h, v5.h[1] + smlal v28.4s, v0.4h, v5.h[2] + smlal v29.4s, v1.4h, v5.h[2] + smlal v30.4s, v0.4h, v5.h[3] + smlal v31.4s, v1.4h, v5.h[3] + + b .non_linear_loop + + {% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %} + +.store: + ldp x5, x6, [x0, #8] // c base ptr, rsc + ldp x7, x8, [x0, #24] // csc, item_size + + cmp x8, #4 + beq .store_strides_i32 + + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..3) %} + st1 { v{{col | times:2 | plus: reg}}.b }[{{lane|times:4}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.store_strides_i32: + {% for col in (8..15) %} + mov x4, x5 + {% for reg in (0..1) %} + {% for lane in (0..3) %} + st1 { v{{col | times:2 | plus: reg}}.s }[{{lane}}], [ x4 ], x6 + {% endfor %} + {% endfor %} + add x5, x5, x7 + {% endfor %} + + b .non_linear_loop + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ldp x26, x27, [sp], #16 + ldp x24, x25, [sp], #16 + ldp x22, x23, [sp], #16 + ldp x20, x21, [sp], #16 + + ret + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq new file mode 100644 index 000000000..d770611b1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq @@ -0,0 +1,8 @@ +// vim: ft=arm + +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_min", op:"smin", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_max", op:"smax", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_mul", op:"mul", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_add", op:"add", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" 
label:"per_col_sub", op:"sub", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub_flipped", op:"sub", mr:mr, from:from, to:to, flipped: true %} diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq new file mode 100644 index 000000000..12fdf9d00 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq @@ -0,0 +1,8 @@ +// vim: ft=arm + +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_min", op:"smin", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_max", op:"smax", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_mul", op:"mul", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_add", op:"add", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub", op:"sub", mr:mr, from:from, to:to %} +{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub_flipped", op:"sub", mr:mr, from:from, to:to, flipped: true %} diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq new file mode 100644 index 000000000..9bf2f8264 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq @@ -0,0 +1,31 @@ +// vim: ft=arm + +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_min", op:"smin", from:from, to:to%} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_max", op:"smax", from:from, to:to%} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_mul", op:"mul", from:from, to:to%} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_add", op:"add", from:from, to:to%} +{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub", op:"sub", from:from, to:to%} 
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub_flipped", op:"sub", from:from, to:to, flipped:true%} + +.clear: +{% for r in (from..to) %} + eor v{{r}}.8b, v{{r}}.8b, v{{r}}.8b +{% endfor %} + b .non_linear_loop + +.leaky_relu: + add x2, x0, #8 + ld1 {v4.s}[0], [ x2 ] + dup v4.4s, v4.s[0] + + // bsl cond/dst, then, else + // fcmge dst, src, #0.0 + {% for r in (from..to) %} + mul v0.4s, v{{r}}.4s, v4.4s + cmge v1.4s, v{{r}}.4s, #0 + bsl v1.16b, v{{r}}.16b, v0.16b + and v{{r}}.16b, v1.16b, v1.16b + {% endfor %} + + b .non_linear_loop + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq new file mode 100644 index 000000000..fec2f539d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq @@ -0,0 +1,267 @@ + +// vim: ft=arm + +.q_scale: + ldp x5, x6, [x0, #8] // x5: shift, x6: policy + add x2, x0, #24 + ld1r { v2.4s }, [x2] // v2.4s <- multiplier + + mov w3, #1 + ins v4.d[0], x3 + dup v4.2d, v4.d[0] // v4.2d <- 1 + + add x5, x5, #32 // add 32 to shift + neg x5, x5 // broadcast shift + ins v1.d[0], x5 + dup v1.2d, v1.d[0] // v1.2s <- -(shift + 32) + + cmp x6, 1 + beq .q_scale_rounding_zero + cmp x6, 2 + beq .q_scale_rounding_away + cmp x6, 3 + beq .q_scale_rounding_minus_inf + cmp x6, 4 + beq .q_scale_rounding_plus_inf + cmp x6, 5 + beq .q_scale_rounding_even + cmp x6, 6 + beq .q_scale_rounding_odd + + b .unsupported + +.q_scale_rounding_zero: + // rust: signum * ((abs + nudge2) >> shift + // asm: signum * (2*abs - 1) >>r (shift + 1) + + {% for q in (16..31) %} + cmlt v0.4s, v{{q}}.4s, #0 + abs v{{q}}.4s, v{{q}}.4s + sqdmull v8.2d, v{{q}}.2s, v2.2s + sqdmull2 v9.2d, v{{q}}.4s, v2.4s //mul without shift and store results in v8 and v9 + + sub v8.2d, v8.2d, v4.2d + sqrshl v8.2d, v8.2d, v1.2d + + sub v9.2d, v9.2d, v4.2d + sqrshl v9.2d, v9.2d, v1.2d + + uzp1 v{{q}}.4s, v8.4s, v9.4s 
//combine back + + neg v3.4s, v{{q}}.4s + bit v{{q}}.16b, v3.16b, v0.16b + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_away: // signum * (abs >> (shift-1) + 1 >> 1) + + {% for q in (16..31) %} + cmlt v0.4s, v{{q}}.4s, #0 + abs v{{q}}.4s, v{{q}}.4s + sqdmull v8.2d, v{{q}}.2s, v2.2s + sqdmull2 v9.2d, v{{q}}.4s, v2.4s //mul without shift and store results in v8 and v9 + + sqrshl v8.2d, v8.2d, v1.2d + sqrshl v9.2d, v9.2d, v1.2d + + uzp1 v{{q}}.4s, v8.4s, v9.4s //combine back + + neg v3.4s, v{{q}}.4s + bit v{{q}}.16b, v3.16b, v0.16b + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_minus_inf: // val >> shift + + {% for q in (16..31) %} + sqdmull v8.2d, v{{q}}.2s, v2.2s + sqdmull2 v9.2d, v{{q}}.4s, v2.4s //mul without shift and store results in v8 and v9 + + sub v8.2d, v8.2d, v4.2d + sqrshl v8.2d, v8.2d, v1.2d + + sub v9.2d, v9.2d, v4.2d + sqrshl v9.2d, v9.2d, v1.2d + + uzp1 v{{q}}.4s, v8.4s, v9.4s //combine back + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_plus_inf: // (val >> shift-1)+1 >>1 + + {% for q in (16..31) %} + sqdmull v8.2d, v{{q}}.2s, v2.2s + sqdmull2 v9.2d, v{{q}}.4s, v2.4s //mul without shift and store results in v8 and v9 + + sqrshl v8.2d, v8.2d, v1.2d + sqrshl v9.2d, v9.2d, v1.2d + + uzp1 v{{q}}.4s, v8.4s, v9.4s //combine back + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_even: // signum * ((abs >> shift-1) + (abs & 0x1) - 1 >> 1) + + {% for q in (16..31) %} + cmlt v0.4s, v{{q}}.4s, #0 + abs v{{q}}.4s, v{{q}}.4s + sqdmull v8.2d, v{{q}}.2s, v2.2s + sqdmull2 v9.2d, v{{q}}.4s, v2.4s //mul without shift and store results in v8 and v9 + + sqshl v3.2d, v8.2d, v1.2d // abs >> shift - 1 + and v3.16b, v3.16b, v4.16b // abs & 0x1 + sub v3.2d, v3.2d, v4.2d //nudge : -1 if we want to round down, 0 if up + + add v8.2d, v8.2d, v3.2d + sqrshl v8.2d, v8.2d, v1.2d + + sqshl v3.2d, v9.2d, v1.2d + and v3.16b, v3.16b, v4.16b + sub v3.2d, v3.2d, v4.2d //nudge : -1 if we want to round down, 0 if up + + add v9.2d, v9.2d, v3.2d + 
sqrshl v9.2d, v9.2d, v1.2d + + uzp1 v{{q}}.4s, v8.4s, v9.4s //combine back + + neg v3.4s, v{{q}}.4s + bit v{{q}}.16b, v3.16b, v0.16b + {% endfor %} + + b .non_linear_loop + +.q_scale_rounding_odd: // signum * ((abs >> shift-1) - (abs & 0x1) >> 1) + + {% for q in (16..31) %} + cmlt v0.4s, v{{q}}.4s, #0 + abs v{{q}}.4s, v{{q}}.4s + sqdmull v8.2d, v{{q}}.2s, v2.2s + sqdmull2 v9.2d, v{{q}}.4s, v2.4s //mul without shift and store results in v8 and v9 + + sqshl v3.2d, v8.2d, v1.2d + and v3.16b, v3.16b, v4.16b //nudge : -1 if we want to round down, 0 if up + + sub v8.2d, v8.2d, v3.2d + sqrshl v8.2d, v8.2d, v1.2d + + sqshl v3.2d, v9.2d, v1.2d + and v3.16b, v3.16b, v4.16b //nudge : -1 if we want to round down, 0 if up + + sub v9.2d, v9.2d, v3.2d + sqrshl v9.2d, v9.2d, v1.2d + + uzp1 v{{q}}.4s, v8.4s, v9.4s //combine back + + neg v3.4s, v{{q}}.4s + bit v{{q}}.16b, v3.16b, v0.16b + {% endfor %} + + b .non_linear_loop + +.q_shl: + ldr x5, [x0, #8] // x5: shift + ins v1.s[0], w5 + dup v1.4s, v1.s[0] // v1.4s <- shift + + {% for q in (16..31) %} + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + {% endfor %} + b .non_linear_loop + +.q_shr: + ldp x5, x6, [x0, #8] // x5: shift, x6: policy + + mov w3, #1 + ins v4.s[0], w3 + dup v4.4s, v4.s[0] // v4.4s <- 1 + + neg w5, w5 // broadcast shift + ins v1.s[0], w5 + dup v1.4s, v1.s[0] // v1.4s <- -shift + + cmp x6, 1 + beq .q_shr_rounding_zero + cmp x6, 2 + beq .q_shr_rounding_away + cmp x6, 3 + beq .q_shr_rounding_minus_inf + cmp x6, 4 + beq .q_shr_rounding_plus_inf + cmp x6, 5 + beq .q_shr_rounding_even + cmp x6, 6 + beq .q_shr_rounding_odd + + b .unsupported + +.q_shr_rounding_zero: + // asm: signum * (abs >>r shift) + {% for q in (16..31) %} + cmlt v0.4s, v{{q}}.4s, #0 + abs v{{q}}.4s, v{{q}}.4s + + sub v{{q}}.4s, v{{q}}.4s, v4.4s + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + + neg v3.4s, v{{q}}.4s + bit v{{q}}.16b, v3.16b, v0.16b + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_away: + {% for q in (16..31) %} + cmlt v0.4s, v{{q}}.4s, #0 + abs
v{{q}}.4s, v{{q}}.4s + + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + + neg v3.4s, v{{q}}.4s + bit v{{q}}.16b, v3.16b, v0.16b + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_minus_inf: + {% for q in (16..31) %} + sqneg v{{q}}.4s, v{{q}}.4s + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + sqneg v{{q}}.4s, v{{q}}.4s + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_plus_inf: + {% for q in (16..31) %} + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_even: + // sqrshl is round(+inf), sqshl truncates + // we look at parity of result by truncation: if it's odd, we have nothing more to do, we go towards +inf + // if it's even, we need to nudge towards 0 by adding -1 + // => nudge = (x >>l shift) & 0x1 - 1 (>>l is sqshl) + // => result is (x + nudge) >>r shift (with sqrshl) + {% for q in (16..31) %} + sqshl v3.4s, v{{q}}.4s, v1.4s // trunc + and v3.16b, v3.16b, v4.16b + sub v3.4s, v3.4s, v4.4s + add v{{q}}.4s, v{{q}}.4s, v3.4s + + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + {% endfor %} + b .non_linear_loop + +.q_shr_rounding_odd: + // here: nudge is -((x >>l shift) & 0x1) + {% for q in (16..31) %} + sqshl v3.4s, v{{q}}.4s, v1.4s // trunc + and v3.16b, v3.16b, v4.16b + neg v3.4s, v3.4s + add v{{q}}.4s, v{{q}}.4s, v3.4s + + sqrshl v{{q}}.4s, v{{q}}.4s, v1.4s + {% endfor %} + b .non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq new file mode 100644 index 000000000..ac920b368 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq @@ -0,0 +1,10 @@ +// vim: ft=arm + +.load_tile: + ldr x2, [ x0, #8 ] + {% for reg in (from..to) %} + ld1 { v{{reg}}.4s }, [ x2 ], #16 + {% endfor %} + + b .non_linear_loop + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl new file mode 100644
index 000000000..84b927e3b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl @@ -0,0 +1,206 @@ +// vim: ft=arm + +// no preservation either for v0-v7 and v16-v31 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_sigmoid_f32_4n_{{suffix}} +{{G}}arm64simd_sigmoid_f32_4n_{{suffix}}: + + cmp x1, #0 + beq .return + + adr x2, .coeffs_num + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + dup v5.4s, v0.s[0] // v5 <- low, broadcasted + dup v6.4s, v0.s[1] // v6 <- high, broadcasted + dup v7.4s, v3.s[1] // v7 <- 0.5, broadcasted + + cmp x1, #16 + blt .loop + +.loop4: + ld1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x0] + + fmax v16.4s, v16.4s, v5.4s + fmax v17.4s, v17.4s, v5.4s + fmax v18.4s, v18.4s, v5.4s + fmax v19.4s, v19.4s, v5.4s + + fmin v16.4s, v16.4s, v6.4s + fmin v17.4s, v17.4s, v6.4s + fmin v18.4s, v18.4s, v6.4s + fmin v19.4s, v19.4s, v6.4s // v16 <- x + + fmul v20.4s, v16.4s, v16.4s + fmul v21.4s, v17.4s, v17.4s + fmul v22.4s, v18.4s, v18.4s + fmul v23.4s, v19.4s, v19.4s // v20 <- x2 + + dup v24.4s, v0.s[3] + fmla v24.4s, v20.4s, v0.s[2] + dup v25.4s, v0.s[3] + fmla v25.4s, v21.4s, v0.s[2] + dup v26.4s, v0.s[3] + fmla v26.4s, v22.4s, v0.s[2] + dup v27.4s, v0.s[3] + fmla v27.4s, v23.4s, v0.s[2] + + dup v28.4s, v1.s[0] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v1.s[0] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v1.s[0] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v1.s[0] + fmla v31.4s, v23.4s, v27.4s + + dup v24.4s, v1.s[1] + fmla v24.4s, v20.4s, v28.4s + dup v25.4s, v1.s[1] + fmla v25.4s, v21.4s, v29.4s + dup v26.4s, v1.s[1] + fmla v26.4s, v22.4s, v30.4s + dup v27.4s, v1.s[1] + fmla v27.4s, v23.4s, v31.4s + + dup v28.4s, v1.s[2] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v1.s[2] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v1.s[2] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v1.s[2] + fmla v31.4s, v23.4s, v27.4s + + dup v24.4s, v1.s[3] + fmla v24.4s, v20.4s, v28.4s + dup v25.4s, v1.s[3] + fmla v25.4s, v21.4s, v29.4s + 
dup v26.4s, v1.s[3] + fmla v26.4s, v22.4s, v30.4s + dup v27.4s, v1.s[3] + fmla v27.4s, v23.4s, v31.4s + + dup v28.4s, v2.s[0] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v2.s[0] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v2.s[0] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v2.s[0] + fmla v31.4s, v23.4s, v27.4s + + fmul v16.4s, v16.4s, v28.4s + fmul v17.4s, v17.4s, v29.4s + fmul v18.4s, v18.4s, v30.4s + fmul v19.4s, v19.4s, v31.4s // v16 <- numerator + + dup v24.4s, v2.s[2] + fmla v24.4s, v20.4s, v2.s[1] + dup v25.4s, v2.s[2] + fmla v25.4s, v21.4s, v2.s[1] + dup v26.4s, v2.s[2] + fmla v26.4s, v22.4s, v2.s[1] + dup v27.4s, v2.s[2] + fmla v27.4s, v23.4s, v2.s[1] + + dup v28.4s, v2.s[3] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v2.s[3] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v2.s[3] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v2.s[3] + fmla v31.4s, v23.4s, v27.4s + + dup v24.4s, v3.s[0] + fmla v24.4s, v20.4s, v28.4s + dup v25.4s, v3.s[0] + fmla v25.4s, v21.4s, v29.4s + dup v26.4s, v3.s[0] + fmla v26.4s, v22.4s, v30.4s + dup v27.4s, v3.s[0] + fmla v27.4s, v23.4s, v31.4s // v24 denum + + fdiv v16.4s, v16.4s, v24.4s + fdiv v17.4s, v17.4s, v25.4s + fdiv v18.4s, v18.4s, v26.4s + fdiv v19.4s, v19.4s, v27.4s + + fadd v16.4s, v16.4s, v7.4s + fadd v17.4s, v17.4s, v7.4s + fadd v18.4s, v18.4s, v7.4s + fadd v19.4s, v19.4s, v7.4s + + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x0], #64 + + subs x1, x1, #16 + cmp x1, #16 + bge .loop4 + + cmp x1, #0 + beq .return + +.loop: + ld1 { v16.4s }, [x0] + + fmax v16.4s, v16.4s, v5.4s + fmin v16.4s, v16.4s, v6.4s // v16 <- x + fmul v20.4s, v16.4s, v16.4s // v20 <- x2 + + dup v24.4s, v0.s[3] + fmla v24.4s, v20.4s, v0.s[2] + dup v28.4s, v1.s[0] + fmla v28.4s, v20.4s, v24.4s + dup v24.4s, v1.s[1] + fmla v24.4s, v20.4s, v28.4s + dup v28.4s, v1.s[2] + fmla v28.4s, v20.4s, v24.4s + dup v24.4s, v1.s[3] + fmla v24.4s, v20.4s, v28.4s + dup v28.4s, v2.s[0] + fmla v28.4s, v20.4s, v24.4s + fmul v16.4s, v16.4s, v28.4s // v16 <- numerator + + dup 
v24.4s, v2.s[2] + fmla v24.4s, v20.4s, v2.s[1] + dup v28.4s, v2.s[3] + fmla v28.4s, v20.4s, v24.4s + dup v24.4s, v3.s[0] + fmla v24.4s, v20.4s, v28.4s // v24 <- denum + + fdiv v16.4s, v16.4s, v24.4s + fadd v16.4s, v16.4s, v7.4s + + st1 { v16.4s }, [x0], #16 + + subs x1, x1, #4 + bne .loop + +.return: + ret + +.coeffs_num: + .float -18.6 // low + .float 18.6 // high + .float -4.433153405e-18 // alpha_13 + .float 1.169974371e-14 + + .float -1.875289645e-11 + .float 4.257889523e-8 + .float 0.00004811817576 + .float 0.008163842030 + + .float 0.2499999971 + .float 3.922935744e-6 // beta_6 + .float 0.001524872358 + .float 0.1159886749 + + .float 1.0 + .float 0.5 // + .float 0.0 // padding + .float 0.0 + diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl new file mode 100644 index 000000000..dc88569ac --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl @@ -0,0 +1,198 @@ +// vim: ft=arm + +// no preservation either for v0-v7 and v16-v31 + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_tanh_f32_4n_{{suffix}} +{{G}}arm64simd_tanh_f32_4n_{{suffix}}: + + cmp x1, #0 + beq .return + + adr x2, .coeffs_num + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2] + dup v5.4s, v0.s[0] // v5 <- low, broadcasted + dup v6.4s, v0.s[1] // v6 <- high, broadcasted + + cmp x1, #16 + blt .loop + +.loop4: + ld1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x0] + + fmax v16.4s, v16.4s, v5.4s + fmax v17.4s, v17.4s, v5.4s + fmax v18.4s, v18.4s, v5.4s + fmax v19.4s, v19.4s, v5.4s + + fmin v16.4s, v16.4s, v6.4s + fmin v17.4s, v17.4s, v6.4s + fmin v18.4s, v18.4s, v6.4s + fmin v19.4s, v19.4s, v6.4s // v16 <- x + + fmul v20.4s, v16.4s, v16.4s + fmul v21.4s, v17.4s, v17.4s + fmul v22.4s, v18.4s, v18.4s + fmul v23.4s, v19.4s, v19.4s // v20 <- x2 + + dup v24.4s, v0.s[3] + fmla v24.4s, v20.4s, v0.s[2] + dup v25.4s, v0.s[3] + fmla v25.4s, v21.4s, v0.s[2] + dup v26.4s, 
v0.s[3] + fmla v26.4s, v22.4s, v0.s[2] + dup v27.4s, v0.s[3] + fmla v27.4s, v23.4s, v0.s[2] + + dup v28.4s, v1.s[0] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v1.s[0] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v1.s[0] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v1.s[0] + fmla v31.4s, v23.4s, v27.4s + + dup v24.4s, v1.s[1] + fmla v24.4s, v20.4s, v28.4s + dup v25.4s, v1.s[1] + fmla v25.4s, v21.4s, v29.4s + dup v26.4s, v1.s[1] + fmla v26.4s, v22.4s, v30.4s + dup v27.4s, v1.s[1] + fmla v27.4s, v23.4s, v31.4s + + dup v28.4s, v1.s[2] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v1.s[2] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v1.s[2] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v1.s[2] + fmla v31.4s, v23.4s, v27.4s + + dup v24.4s, v1.s[3] + fmla v24.4s, v20.4s, v28.4s + dup v25.4s, v1.s[3] + fmla v25.4s, v21.4s, v29.4s + dup v26.4s, v1.s[3] + fmla v26.4s, v22.4s, v30.4s + dup v27.4s, v1.s[3] + fmla v27.4s, v23.4s, v31.4s + + dup v28.4s, v2.s[0] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v2.s[0] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v2.s[0] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v2.s[0] + fmla v31.4s, v23.4s, v27.4s + + fmul v16.4s, v16.4s, v28.4s + fmul v17.4s, v17.4s, v29.4s + fmul v18.4s, v18.4s, v30.4s + fmul v19.4s, v19.4s, v31.4s // v16 <- numerator + + dup v24.4s, v2.s[2] + fmla v24.4s, v20.4s, v2.s[1] + dup v25.4s, v2.s[2] + fmla v25.4s, v21.4s, v2.s[1] + dup v26.4s, v2.s[2] + fmla v26.4s, v22.4s, v2.s[1] + dup v27.4s, v2.s[2] + fmla v27.4s, v23.4s, v2.s[1] + + dup v28.4s, v2.s[3] + fmla v28.4s, v20.4s, v24.4s + dup v29.4s, v2.s[3] + fmla v29.4s, v21.4s, v25.4s + dup v30.4s, v2.s[3] + fmla v30.4s, v22.4s, v26.4s + dup v31.4s, v2.s[3] + fmla v31.4s, v23.4s, v27.4s + + dup v24.4s, v3.s[0] + fmla v24.4s, v20.4s, v28.4s + dup v25.4s, v3.s[0] + fmla v25.4s, v21.4s, v29.4s + dup v26.4s, v3.s[0] + fmla v26.4s, v22.4s, v30.4s + dup v27.4s, v3.s[0] + fmla v27.4s, v23.4s, v31.4s // v24 denum + + fdiv v16.4s, v16.4s, v24.4s + fdiv v17.4s, v17.4s, v25.4s + fdiv 
v18.4s, v18.4s, v26.4s + fdiv v19.4s, v19.4s, v27.4s + + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x0], #64 + + subs x1, x1, #16 + cmp x1, #16 + bge .loop4 + + cmp x1, #0 + beq .return + +.loop: + ld1 { v16.4s }, [x0] + + fmax v16.4s, v16.4s, v5.4s + fmin v16.4s, v16.4s, v6.4s // v16 <- x + fmul v20.4s, v16.4s, v16.4s // v20 <- x2 + + dup v24.4s, v0.s[3] + fmla v24.4s, v20.4s, v0.s[2] + dup v28.4s, v1.s[0] + fmla v28.4s, v20.4s, v24.4s + dup v24.4s, v1.s[1] + fmla v24.4s, v20.4s, v28.4s + dup v28.4s, v1.s[2] + fmla v28.4s, v20.4s, v24.4s + dup v24.4s, v1.s[3] + fmla v24.4s, v20.4s, v28.4s + dup v28.4s, v2.s[0] + fmla v28.4s, v20.4s, v24.4s + fmul v16.4s, v16.4s, v28.4s // v16 <- numerator + + dup v24.4s, v2.s[2] + fmla v24.4s, v20.4s, v2.s[1] + dup v28.4s, v2.s[3] + fmla v28.4s, v20.4s, v24.4s + dup v24.4s, v3.s[0] + fmla v24.4s, v20.4s, v28.4s // v24 <- denum + + fdiv v16.4s, v16.4s, v24.4s + + st1 { v16.4s }, [x0], #16 + + subs x1, x1, #4 + bne .loop + +.return: + ret + +.coeffs_num: + .float -8.9 // low + .float 8.9 // high + .float -8.488492677e-14 // alpha_13 + .float 5.277853000e-11 + + .float -2.022500419e-8 + .float 0.00001115424833 + .float 0.003103950131 + .float 0.1308400453 + + .float 0.9999999934 + .float 0.0002546136580 // beta_6 + .float 0.02449515379 + .float 0.4641733162 + + .float 1.0 + .float 0 // padding + .float 0 // padding + .float 0 // padding diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq new file mode 100644 index 000000000..150db4683 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq @@ -0,0 +1,37 @@ +// vim: ft=arm + +.non_linear: + sub x0, x0, 40 + +.non_linear_loop: + add x0, x0, 40 + ldr x2, [x0] + + mov x4, #{{ jump_table | size }} + + cmp x2, #{{ jump_table | size }} + csel x2, x2, x4, lt + cmp x2, #0 + csel x2, x4, x2, lt + + adr x3, .jmp_table + add x3, x3, x2, LSL#2 + br x3 + +.jmp_table: +{% for j in jump_table 
%} + b .{{j}} +{% endfor %} + b .unsupported + + add x0, x2, #4000 + b .return + +.unsupported: + mov x0, #1 + b .return + +.done: + mov x0, 0 + b .return + diff --git a/vendor/tract-linalg-0.22.1/benches/arm32neon.rs b/vendor/tract-linalg-0.22.1/benches/arm32neon.rs new file mode 100644 index 000000000..4c5101d02 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/arm32neon.rs @@ -0,0 +1,179 @@ +#![feature(asm)] +#![allow(dead_code, non_upper_case_globals, unused_macros, non_snake_case, unused_assignments)] + +use std::time::Instant; + +macro_rules! r2 { ($($stat:stmt)*) => { $( $stat )* $( $stat )* } } +macro_rules! r4 { ($($stat:stmt)*) => { r2!(r2!($($stat)*)) }} +macro_rules! r8 { ($($stat:stmt)*) => { r4!(r2!($($stat)*)) }} +macro_rules! r16 { ($($stat:stmt)*) => { r4!(r4!($($stat)*)) }} +macro_rules! r32 { ($($stat:stmt)*) => { r8!(r4!($($stat)*)) }} +macro_rules! r64 { ($($stat:stmt)*) => { r8!(r8!($($stat)*)) }} +macro_rules! r128 { ($($stat:stmt)*) => { r8!(r16!($($stat)*)) }} +macro_rules! r1024 { ($($stat:stmt)*) => { r8!(r128!($($stat)*)) }} +macro_rules! r4096 { ($($stat:stmt)*) => { r4!(r1024!($($stat)*)) }} + +const _F32: [f32; 1024] = [12.; 1024]; +const F32: *const f32 = _F32.as_ptr(); + +/* +fn ruin_cache() { +let _a = (0..1000000).collect::>(); +} +*/ + +macro_rules! 
b { + ($f: block, $inner_loop: expr, $measures: expr) => {{ + let mut values = Vec::with_capacity($measures); + for _ in 0..$measures { + // ruin_cache(); + let start = Instant::now(); + for _ in 0..$inner_loop { + unsafe { $f }; + } + values.push(start.elapsed()); + } + values.sort(); + values[$measures / 2].as_nanos() as f64 / 1e9 / $inner_loop as f64 + }}; +} + +fn main() { + let cycle = b!( + { + r1024!(asm!("orr r0, r0, r0", out("r0") _)); + }, + 1000, + 1000 + ) / 1024.; + let indep_fmla = b!( + { + r8!(asm!(" + vmla.f32 q0, q0, q0 + vmla.f32 q1, q1, q1 + vmla.f32 q2, q2, q2 + vmla.f32 q3, q3, q3 + vmla.f32 q4, q4, q4 + vmla.f32 q5, q5, q5 + vmla.f32 q6, q6, q6 + vmla.f32 q7, q7, q7 + ", out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _)); + }, + 1000, + 1000 + ) / 64.; + eprintln!("rcp tp: indep fmla: {}", indep_fmla / cycle); + let dep_accu_fmla = b!( + { + r16!(asm!(" + vmla.f32 q15, q0, q0 + vmla.f32 q15, q1, q1 + vmla.f32 q15, q2, q2 + vmla.f32 q15, q3, q3 + vmla.f32 q15, q4, q4 + vmla.f32 q15, q5, q5 + vmla.f32 q15, q6, q6 + vmla.f32 q15, q7, q7 + vmla.f32 q15, q8, q8 + vmla.f32 q15, q9, q9 + vmla.f32 q15, q10, q10 + vmla.f32 q15, q11, q11 + vmla.f32 q15, q12, q12 + vmla.f32 q15, q13, q13 + vmla.f32 q15, q14, q14 + ", out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _, + out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _)); + }, + 1000, + 1000 + ) / 16. + / 15.; + eprintln!("rcp tp: accu-dep fmla: {}", dep_accu_fmla / cycle); + let load_s_using_vld1_64 = b!( + { + let mut p = F32; + r16!(asm!(" + vld1.64 {{d0-d3}}, [{0}]! + vld1.64 {{d4-d7}}, [{0}]! + vld1.64 {{d8-d11}}, [{0}]! + vld1.64 {{d12-d15}}, [{0}]! + vld1.64 {{d16-d19}}, [{0}]! + vld1.64 {{d20-d23}}, [{0}]! + vld1.64 {{d24-d27}}, [{0}]! + vld1.64 {{d28-d31}}, [{0}]! 
+ ", + inout(reg) p, + out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _, + out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _)); + }, + 1000, + 1000 + ) / 16. + / 64.; // each line load 8 s + eprintln!("rcp tp: load s using vld1_64 ia {}", load_s_using_vld1_64 / cycle); + let load_s_using_vldm_q = b!( + { + let mut p = F32; + r16!(asm!(" + vldm {0}!, {{q0-q3}} + vldm {0}!, {{q4-q7}} + vldm {0}!, {{q8-q11}} + vldm {0}!, {{q12-q15}} + ", + inout(reg) p, + out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _, + out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _)); + }, + 1000, + 1000 + ) / 16. + / 64.; + eprintln!("rcp tp: load s using vldmia q: {}", load_s_using_vldm_q / cycle); + let load = b!( + { + let mut p = F32; + r16!(asm!(" + vldr.64 d0, [{0}] + vldr.64 d1, [{0}, #8] + vldr.64 d2, [{0}, #16] + vldr.64 d3, [{0}, #24] + vldr.64 d4, [{0}, #32] + vldr.64 d5, [{0}, #40] + vldr.64 d6, [{0}, #48] + vldr.64 d7, [{0}, #56] + vldr.64 d8, [{0}, #64] + vldr.64 d9, [{0}, #72] + vldr.64 d10, [{0}, #80] + vldr.64 d11, [{0}, #88] + vldr.64 d12, [{0}, #96] + vldr.64 d13, [{0}, #104] + vldr.64 d14, [{0}, #112] + vldr.64 d15, [{0}, #120] + vldr.64 d16, [{0}, #128] + vldr.64 d17, [{0}, #136] + vldr.64 d18, [{0}, #144] + vldr.64 d19, [{0}, #152] + vldr.64 d20, [{0}, #160] + vldr.64 d21, [{0}, #168] + vldr.64 d22, [{0}, #176] + vldr.64 d23, [{0}, #184] + vldr.64 d24, [{0}, #192] + vldr.64 d25, [{0}, #200] + vldr.64 d26, [{0}, #208] + vldr.64 d27, [{0}, #216] + vldr.64 d28, [{0}, #224] + vldr.64 d29, [{0}, #232] + vldr.64 d30, [{0}, #240] + vldr.64 d31, [{0}, #248] + add {0}, #256 + ", + inout(reg) p, + out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _, + out("q8") _, out("q9") _, out("q10") _, out("q11") _, 
out("q12") _, out("q13") _, out("q14") _, out("q15") _)); + }, + 1000, + 1000 + ) / 16. + / 64.; + eprintln!("rcp tp: load s using vldr d + imm: {}", load / cycle); +} diff --git a/vendor/tract-linalg-0.22.1/benches/arm64.rs b/vendor/tract-linalg-0.22.1/benches/arm64.rs new file mode 100644 index 000000000..c153dbdb4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/arm64.rs @@ -0,0 +1,77 @@ +use std::time::Instant; + +use tract_data::prelude::*; +use tract_linalg::LADatum; +use tract_linalg::frame::mmm::FusedSpec; +use tract_linalg::frame::mmm::MatMatMulKer; + +fn ruin_cache() { + let _a = (0..1000000).collect::>(); +} + +fn bench_to_nanos>( + k: usize, + loops: usize, +) -> f64 { + let item_size = T::datum_type().size_of(); + let a = Tensor::zero_aligned::( + &[(k + K::end_padding_packed_a()) * K::mr()], + K::alignment_bytes_packed_a(), + ) + .unwrap(); + let b = Tensor::zero_aligned::( + &[(k + K::end_padding_packed_b()) * K::nr()], + K::alignment_bytes_packed_b(), + ) + .unwrap(); + let mut c = Tensor::zero::(&[K::mr() * K::nr()]).unwrap(); + let ref a = InputStoreKer::Packed { ptr: unsafe { a.as_ptr_unchecked::() as _ } }; + let ref b = InputStoreKer::Packed { ptr: unsafe { b.as_ptr_unchecked::() as _ } }; + let ref c = OutputStoreKer { + ptr: unsafe { c.as_ptr_mut_unchecked::() as _ }, + item_size, + col_byte_stride: (item_size * K::mr()) as isize, + row_byte_stride: item_size as isize, + }; + let ref linear = LinearSpec::Mul { k }; + let op = MatMatMulKerSpec { a, b, c, linear, non_linear: std::ptr::null() }; + let mut values = Vec::with_capacity(loops); + for _ in 0..loops { + ruin_cache(); + let start = Instant::now(); + K::kernel(&op); + values.push(start.elapsed()); + } + values.sort(); + values[loops / 2].as_nanos() as f64 +} + +fn model>() -> (f64, f64) { + let x = 1000; + let zp = bench_to_nanos::(0, 10000); + let y = bench_to_nanos::(x, 1000); + let slope = (y - zp) / x as f64; + (slope, zp) +} + +fn as_match_line>() { + let coeffs = model::(); 
+ println!( + "({:?}, {}, {}) => {} * k + {},", + K::name(), + K::mr(), + K::nr(), + (coeffs.0 * 1000.).round(), + (coeffs.1 * 1000.).round() + ); +} + +fn main() { + use tract_linalg::arm64::*; + as_match_line::(); + as_match_line::(); + as_match_line::(); + as_match_line::(); + as_match_line::(); + as_match_line::(); +} diff --git a/vendor/tract-linalg-0.22.1/benches/arm64simd.rs b/vendor/tract-linalg-0.22.1/benches/arm64simd.rs new file mode 100644 index 000000000..1dd244ff1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/arm64simd.rs @@ -0,0 +1,926 @@ +#![allow(dead_code, non_upper_case_globals, unused_macros, non_snake_case, unused_assignments)] + +use std::arch::asm; + +mod nano; + +#[repr(C, align(8))] +struct Floats([f32; 4096]); +const _F32: Floats = Floats([12.; 4096]); +const F32: *const f32 = (&_F32) as *const Floats as *const f32; + +lazy_static::lazy_static! { + static ref TICK: f64 = unsafe { b8192!(asm!("orr x20, x20, x20", out("x20") _)) }; +} + +pub unsafe fn armv8(filter: Option<&str>) { + macro_rules! s32 { + ($label: literal, $n: expr, $stmt:block) => { + if $label.contains(filter.unwrap_or("")) { + println!("{:40} {:.2}", $label, b32!($stmt) / $n as f64 / *TICK); + } + }; + } + + macro_rules! s128 { + ($label: literal, $n: expr, $stmt:block) => { + if $label.contains(filter.unwrap_or("")) { + println!("{:40} {:.2}", $label, b128!($stmt) / $n as f64 / *TICK); + } + }; + } + + macro_rules! s1024 { + ($label: literal, $n: expr, $stmt:block) => { + if $label.contains(filter.unwrap_or("")) { + println!("{:40} {:.2}", $label, b1024!($stmt) / $n as f64 / *TICK); + } + }; + } + + macro_rules! 
s8192 { + ($label: literal, $n: expr, $stmt:block) => { + if $label.contains(filter.unwrap_or("")) { + println!("{:40} {:.2}", $label, b8192!($stmt) / $n as f64 / *TICK); + } + }; + } + + s128!("nop", 1, { asm!("nop") }); + s128!("vands", 4, { + asm!(" and v0.16b, v1.16b, v1.16b + and v2.16b, v3.16b, v3.16b + and v4.16b, v5.16b, v5.16b + and v6.16b, v7.16b, v7.16b ", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + ) + }); + s128!("fmax", 4, { + asm!(" fmax v0.4s, v1.4s, v1.4s + fmax v2.4s, v3.4s, v3.4s + fmax v4.4s, v5.4s, v5.4s + fmax v6.4s, v7.4s, v7.4s ", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + ) + }); + s128!("fmax_with_dep", 1, { asm!("fmax v0.4s, v0.4s, v0.4s", out("v0") _) }); + s128!("fmla", 16, { + asm!(" fmla v0.4s, v0.4s, v0.4s + fmla v1.4s, v1.4s, v1.4s + fmla v2.4s, v2.4s, v2.4s + fmla v3.4s, v3.4s, v3.4s + fmla v4.4s, v4.4s, v4.4s + fmla v5.4s, v5.4s, v5.4s + fmla v6.4s, v6.4s, v6.4s + fmla v7.4s, v7.4s, v7.4s + fmla v8.4s, v8.4s, v8.4s + fmla v9.4s, v9.4s, v9.4s + fmla v10.4s,v10.4s,v10.4s + fmla v11.4s,v11.4s,v11.4s + fmla v12.4s,v12.4s,v12.4s + fmla v13.4s,v13.4s,v13.4s + fmla v14.4s,v14.4s,v14.4s + fmla v15.4s,v15.4s,v15.4s ", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + ) + }); + + s128!("fmla_with_dep", 1, { asm!("fmla v0.4s, v0.4s, v0.4s", out("v0") _) }); + s32!("w_load", 64, { + let mut p = F32; + r8!(asm!("ldr w20, [{0}] + ldr w21, [{0}] + ldr w22, [{0}] + ldr w23, [{0}] + ldr w24, [{0}] + ldr w25, [{0}] + ldr w26, [{0}] + ldr w27, [{0}]", + inout(reg) p, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + )); + }); + s32!("x_load", 64, { + let mut p = F32; + r8!(asm!(" + 
ldr x20, [{0}] + ldr x21, [{0}] + ldr x22, [{0}] + ldr x23, [{0}] + ldr x24, [{0}] + ldr x25, [{0}] + ldr x26, [{0}] + ldr x27, [{0}] + ", + inout(reg) p, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + )); + }); + s32!("d_load", 64, { + let mut p = F32; + r8!(asm!(" + ldr d20, [{0}] + ldr d21, [{0}] + ldr d22, [{0}] + ldr d23, [{0}] + ldr d24, [{0}] + ldr d25, [{0}] + ldr d26, [{0}] + ldr d27, [{0}] + ", + inout(reg) p, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + )); + }); + s32!("s_load", 64, { + let mut p = F32; + r8!(asm!(" + ld1 {{v20.s}}[0], [{0}] + ld1 {{v21.s}}[0], [{0}] + ld1 {{v22.s}}[0], [{0}] + ld1 {{v23.s}}[0], [{0}] + ld1 {{v24.s}}[0], [{0}] + ld1 {{v25.s}}[0], [{0}] + ld1 {{v26.s}}[0], [{0}] + ld1 {{v27.s}}[0], [{0}] + ", + inout(reg) p, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + )); + }); + s32!("d_load_as_v", 64, { + let mut p = F32; + r8!(asm!(" + ld1 {{v20.d}}[0], [{0}] + ld1 {{v21.d}}[0], [{0}] + ld1 {{v22.d}}[0], [{0}] + ld1 {{v23.d}}[0], [{0}] + ld1 {{v24.d}}[0], [{0}] + ld1 {{v25.d}}[0], [{0}] + ld1 {{v26.d}}[0], [{0}] + ld1 {{v27.d}}[0], [{0}] + ", + inout(reg) p, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + )); + }); + s32!("v_load", 64, { + let mut p = F32; + r8!(asm!(" + ld1 {{v20.4s}}, [{0}] + ld1 {{v21.4s}}, [{0}] + ld1 {{v22.4s}}, [{0}] + ld1 {{v23.4s}}, [{0}] + ld1 {{v24.4s}}, [{0}] + ld1 {{v25.4s}}, [{0}] + ld1 {{v26.4s}}, [{0}] + ld1 {{v27.4s}}, [{0}] + ", + inout(reg) p, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + )); + }); + s32!("v2_load", 64, { + let mut p = F32; + r8!(asm!(" + ld1 {{v0.4s, v1.4s}}, [{0}] + ld1 {{v2.4s, v3.4s}}, [{0}] + ld1 {{v4.4s, v5.4s}}, [{0}] + ld1 
{{v6.4s, v7.4s}}, [{0}] + ld1 {{v8.4s, v9.4s}}, [{0}] + ld1 {{v10.4s, v11.4s}}, [{0}] + ld1 {{v12.4s, v13.4s}}, [{0}] + ld1 {{v14.4s, v15.4s}}, [{0}] + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("v3_load", 32, { + let mut p = F32; + r8!(asm!(" + ld1 {{v0.4s, v1.4s, v2.4s}}, [{0}] + ld1 {{v3.4s, v4.4s, v5.4s}}, [{0}] + ld1 {{v6.4s, v7.4s, v8.4s}}, [{0}] + ld1 {{v9.4s, v10.4s, v11.4s}}, [{0}] + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + )); + }); + s32!("v4_load", 32, { + let mut p = F32; + r8!(asm!(" + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{0}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{0}] + ld1 {{v8.4s, v9.4s, v10.4s, v11.4s}}, [{0}] + ld1 {{v12.4s, v13.4s, v14.4s, v15.4s}}, [{0}] + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("ins_32b", 64, { + r8!(asm!(" + ins v8.s[0], w20 + ins v9.s[0], w20 + ins v10.s[0], w20 + ins v11.s[0], w20 + ins v12.s[0], w20 + ins v13.s[0], w20 + ins v14.s[0], w20 + ins v15.s[0], w20 + ", + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("ins_32b_same_lane", 128, { + r8!(asm!(" + ins v0.s[0], w20 + ins v1.s[0], w20 + ins v4.s[0], w20 + ins v5.s[0], w20 + ins v0.s[1], w20 + ins v1.s[1], w20 + ins v4.s[1], w20 + ins v5.s[1], w20 + ins v0.s[2], w20 + ins v1.s[2], w20 + ins v4.s[2], w20 + ins v5.s[2], w20 + ins v0.s[3], w20 + ins v1.s[3], w20 + ins v4.s[3], w20 + ins v5.s[3], w20 + ", + out("v0") _, 
out("v1") _, out("v4") _, out("v5") _, + )); + }); + s32!("ins_64b", 64, { + r8!(asm!(" + ins v8.d[0], x20 + ins v9.d[0], x20 + ins v10.d[0], x20 + ins v11.d[0], x20 + ins v12.d[0], x20 + ins v13.d[0], x20 + ins v14.d[0], x20 + ins v15.d[0], x20 + ", + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("ins_64b_same_v", 64, { + r8!(asm!(" + ins v8.d[0], x20 + ins v8.d[1], x20 + ins v8.d[0], x20 + ins v8.d[1], x20 + ins v8.d[0], x20 + ins v8.d[1], x20 + ins v8.d[0], x20 + ins v8.d[1], x20 + ", + out("v8") _, + )); + }); + s32!("ins_64b_from_v", 64, { + r8!(asm!(" + ins v8.d[0], v9.d[0] + ins v8.d[1], v9.d[0] + ins v8.d[0], v9.d[1] + ins v8.d[1], v9.d[1] + ins v8.d[0], v9.d[0] + ins v8.d[1], v9.d[0] + ins v8.d[0], v9.d[1] + ins v8.d[1], v9.d[1] + ", + out("v8") _, + )); + }); + s32!("fmla_with_prfm", 64, { + let mut p = F32; + r8!(asm!(" + prfm pldl1keep, [{0}, #256] + fmla v0.4s, v0.4s, v0.4s + prfm pldl1keep, [{0}, #320] + fmla v1.4s, v1.4s, v1.4s + prfm pldl1keep, [{0}, #384] + fmla v2.4s, v2.4s, v2.4s + prfm pldl1keep, [{0}, #448] + fmla v3.4s, v3.4s, v3.4s + prfm pldl1keep, [{0}, #512] + fmla v4.4s, v4.4s, v4.4s + prfm pldl1keep, [{0}, #576] + fmla v5.4s, v5.4s, v5.4s + prfm pldl1keep, [{0}, #640] + fmla v6.4s, v6.4s, v6.4s + prfm pldl1keep, [{0}, #704] + fmla v7.4s, v7.4s, v7.4s + prfm pldl1keep, [{0}, #768] + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + )); + }); + s32!("fmla_with_w_load", 64, { + let mut p = F32; + r8!(asm!(" + ldr w20, [{0}] + fmla v0.4s, v0.4s, v0.4s + ldr w21, [{0}] + fmla v1.4s, v1.4s, v1.4s + ldr w22, [{0}] + fmla v2.4s, v2.4s, v2.4s + ldr w23, [{0}] + fmla v3.4s, v3.4s, v3.4s + ldr w24, [{0}] + fmla v4.4s, v4.4s, v4.4s + ldr w25, [{0}] + fmla v5.4s, v5.4s, v5.4s + ldr w26, [{0}] + fmla v6.4s, v6.4s, v6.4s + ldr w27, [{0}] + fmla v7.4s, v7.4s, v7.4s + ", + inout(reg) p, + 
out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + )); + }); + s32!("fmla_with_w_load_inc", 64, { + let mut p = F32; + r8!(asm!(" + ldr w20, [{0}], #4 + fmla v0.4s, v0.4s, v0.4s + ldr w21, [{0}], #4 + fmla v1.4s, v1.4s, v1.4s + ldr w22, [{0}], #4 + fmla v2.4s, v2.4s, v2.4s + ldr w23, [{0}], #4 + fmla v3.4s, v3.4s, v3.4s + ldr w24, [{0}], #4 + fmla v4.4s, v4.4s, v4.4s + ldr w25, [{0}], #4 + fmla v5.4s, v5.4s, v5.4s + ldr w26, [{0}], #4 + fmla v6.4s, v6.4s, v6.4s + ldr w27, [{0}], #4 + fmla v7.4s, v7.4s, v7.4s + ", + inout(reg) p, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + )); + }); + s32!("fmla_with_w_load_inc_alt", 64, { + let mut p = F32; + let mut q = F32; + r8!(asm!(" + ldr w20, [{0}], #4 + fmla v0.4s, v0.4s, v0.4s + ldr w21, [{1}], #4 + fmla v1.4s, v1.4s, v1.4s + ldr w22, [{0}], #4 + fmla v2.4s, v2.4s, v2.4s + ldr w23, [{1}], #4 + fmla v3.4s, v3.4s, v3.4s + ldr w24, [{0}], #4 + fmla v4.4s, v4.4s, v4.4s + ldr w25, [{1}], #4 + fmla v5.4s, v5.4s, v5.4s + ldr w26, [{0}], #4 + fmla v6.4s, v6.4s, v6.4s + ldr w27, [{1}], #4 + fmla v7.4s, v7.4s, v7.4s + ", + inout(reg) p, inout(reg) q, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + )); + }); + s32!("fmla_with_w_load_offset", 64, { + let mut p = F32; + r8!(asm!(" + ldr w20, [{0}] + fmla v0.4s, v0.4s, v0.4s + ldr w21, [{0}, #4] + fmla v1.4s, v1.4s, v1.4s + ldr w22, [{0}, #8] + fmla v2.4s, v2.4s, v2.4s + ldr w23, [{0}, #12] + fmla v3.4s, v3.4s, v3.4s + ldr w24, [{0}, #16] + fmla v4.4s, v4.4s, 
v4.4s + ldr w25, [{0}, #20] + fmla v5.4s, v5.4s, v5.4s + ldr w26, [{0}, #24] + fmla v6.4s, v6.4s, v6.4s + ldr w27, [{0}, #28] + fmla v7.4s, v7.4s, v7.4s + ", + inout(reg) p, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + )); + }); + s32!("fmla_with_x_load", 64, { + let mut p = F32; + r8!(asm!(" + fmla v0.4s, v0.4s, v0.4s + ldr x20, [{0}] + fmla v1.4s, v1.4s, v1.4s + ldr x21, [{0}] + fmla v2.4s, v2.4s, v2.4s + ldr x22, [{0}] + fmla v3.4s, v3.4s, v3.4s + ldr x23, [{0}] + fmla v4.4s, v4.4s, v4.4s + ldr x24, [{0}] + fmla v5.4s, v5.4s, v5.4s + ldr x25, [{0}] + fmla v6.4s, v6.4s, v6.4s + ldr x26, [{0}] + fmla v7.4s, v7.4s, v7.4s + ldr x27, [{0}] + ", + inout(reg) p, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + )); + }); + s32!("fmla_with_s_load", 64, { + let mut p = F32; + r8!(asm!(" + ldr s16, [{0}] + fmla v0.4s, v0.4s, v0.4s + ldr s17, [{0}] + fmla v1.4s, v1.4s, v1.4s + ldr s18, [{0}] + fmla v2.4s, v2.4s, v2.4s + ldr s19, [{0}] + fmla v3.4s, v3.4s, v3.4s + ldr s20, [{0}] + fmla v4.4s, v4.4s, v4.4s + ldr s21, [{0}] + fmla v5.4s, v5.4s, v5.4s + ldr s22, [{0}] + fmla v6.4s, v6.4s, v6.4s + ldr s23, [{0}] + fmla v7.4s, v7.4s, v7.4s + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("fmla_with_d_load", 64, { + let mut p = F32; + r8!(asm!(" + ldr d16, [{0}] + fmla v0.4s, v0.4s, v0.4s + ldr d17, [{0}] + fmla v1.4s, v1.4s, v1.4s + ldr d18, [{0}] + fmla v2.4s, v2.4s, v2.4s + ldr d19, [{0}] + fmla v3.4s, v3.4s, v3.4s + 
ldr d20, [{0}] + fmla v4.4s, v4.4s, v4.4s + ldr d21, [{0}] + fmla v5.4s, v5.4s, v5.4s + ldr d22, [{0}] + fmla v6.4s, v6.4s, v6.4s + ldr d23, [{0}] + fmla v7.4s, v7.4s, v7.4s + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + )); + }); + s32!("fmla_with_d_load_as_v", 64, { + let mut p = F32; + r8!(asm!(" + fmla v0.4s, v0.4s, v0.4s + ld1 {{ v9.d }}[0], [{0}] + fmla v1.4s, v1.4s, v1.4s + ld1 {{ v10.d }}[0], [{0}] + fmla v2.4s, v2.4s, v2.4s + ld1 {{ v11.d }}[0], [{0}] + fmla v3.4s, v3.4s, v3.4s + ld1 {{ v12.d }}[0], [{0}] + fmla v4.4s, v4.4s, v4.4s + ld1 {{ v13.d }}[0], [{0}] + fmla v5.4s, v5.4s, v5.4s + ld1 {{ v14.d }}[0], [{0}] + fmla v6.4s, v6.4s, v6.4s + ld1 {{ v15.d }}[0], [{0}] + fmla v7.4s, v7.4s, v7.4s + ld1 {{ v16.d }}[0], [{0}] + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("fmla_with_v_load", 64, { + let mut p = F32; + r8!(asm!(" + fmla v0.4s, v0.4s, v0.4s + ld1 {{ v9.4s }}, [{0}] + fmla v1.4s, v1.4s, v1.4s + ld1 {{ v10.4s }}, [{0}] + fmla v2.4s, v2.4s, v2.4s + ld1 {{ v11.4s }}, [{0}] + fmla v3.4s, v3.4s, v3.4s + ld1 {{ v12.4s }}, [{0}] + fmla v4.4s, v4.4s, v4.4s + ld1 {{ v13.4s }}, [{0}] + fmla v5.4s, v5.4s, v5.4s + ld1 {{ v14.4s }}, [{0}] + fmla v6.4s, v6.4s, v6.4s + ld1 {{ v15.4s }}, [{0}] + fmla v7.4s, v7.4s, v7.4s + ld1 {{ v16.4s }}, [{0}] + ", + inout(reg) p, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, 
out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("fmla_with_ins_32b", 64, { + r8!(asm!(" + fmla v0.4s, v0.4s, v0.4s + ins v8.s[0], w20 + fmla v1.4s, v1.4s, v1.4s + ins v9.s[0], w20 + fmla v2.4s, v2.4s, v2.4s + ins v10.s[0], w20 + fmla v3.4s, v3.4s, v3.4s + ins v11.s[0], w20 + fmla v4.4s, v4.4s, v4.4s + ins v12.s[0], w20 + fmla v5.4s, v5.4s, v5.4s + ins v13.s[0], w20 + fmla v6.4s, v6.4s, v6.4s + ins v14.s[0], w20 + fmla v7.4s, v7.4s, v7.4s + ins v15.s[0], w20 + ", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("x20") _, + )); + }); + s32!("fmla_with_ins_64b", 64, { + r8!(asm!(" + fmla v0.4s, v0.4s, v0.4s + ins v8.d[0], x20 + fmla v1.4s, v1.4s, v1.4s + ins v9.d[0], x20 + fmla v2.4s, v2.4s, v2.4s + ins v10.d[0], x20 + fmla v3.4s, v3.4s, v3.4s + ins v11.d[0], x20 + fmla v4.4s, v4.4s, v4.4s + ins v12.d[0], x20 + fmla v5.4s, v5.4s, v5.4s + ins v13.d[0], x20 + fmla v6.4s, v6.4s, v6.4s + ins v14.d[0], x20 + fmla v7.4s, v7.4s, v7.4s + ins v15.d[0], x20 + ", + out("x20") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("fmla_with_ins_64b_cross_parity", 64, { + r8!(asm!(" + fmla v0.4s, v0.4s, v0.4s + ins v9.d[0], x20 + fmla v1.4s, v1.4s, v1.4s + ins v10.d[0], x20 + fmla v2.4s, v2.4s, v2.4s + ins v11.d[0], x20 + fmla v3.4s, v6.4s, v3.4s + ins v12.d[0], x20 + fmla v4.4s, v4.4s, v4.4s + ins v13.d[0], x20 + fmla v5.4s, v5.4s, v5.4s + ins v14.d[0], x20 + fmla v6.4s, v6.4s, v6.4s + ins v15.d[0], x20 + fmla v7.4s, v7.4s, v7.4s + ins v8.d[0], x20 + ", + out("x20") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, 
out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("ins_32b_with_load_s", 64, { + let mut p = F32; + r8!(asm!(" + ldr s0, [{0}] + ins v8.d[0], x20 + ldr s1, [{0}] + ins v9.d[0], x20 + ldr s2, [{0}] + ins v10.d[0], x20 + ldr s3, [{0}] + ins v11.d[0], x20 + ldr s4, [{0}] + ins v12.d[0], x20 + ldr s5, [{0}] + ins v13.d[0], x20 + ldr s6, [{0}] + ins v14.d[0], x20 + ldr s7, [{0}] + ins v15.d[0], x20 + ", + inout(reg) p, + out("x20") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); + s32!("ins_32b_with_load_s_cross_parity", 64, { + let mut p = F32; + r8!(asm!(" + ldr s0, [{0}] + ins v9.d[0], x20 + ldr s1, [{0}] + ins v10.d[0], x20 + ldr s2, [{0}] + ins v11.d[0], x20 + ldr s3, [{0}] + ins v12.d[0], x20 + ldr s4, [{0}] + ins v13.d[0], x20 + ldr s5, [{0}] + ins v14.d[0], x20 + ldr s6, [{0}] + ins v15.d[0], x20 + ldr s7, [{0}] + ins v8.d[0], x20 + ", + inout(reg) p, + out("x20") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + )); + }); +} + +fn has_asimdhp() -> bool { + std::fs::read_to_string("/proc/cpuinfo").unwrap().contains("asimdhp") +} + +#[target_feature(enable = "fp16")] +pub unsafe fn asimdhp(filter: Option<&str>) { + macro_rules! 
s32 { + ($label: literal, $n: expr, $stmt:block) => { + if $label.contains(filter.unwrap_or("")) { + println!("{:40} {:.2}", $label, b32!($stmt) / $n as f64 / *TICK); + } + }; + } + + s32!("fmlahp", 16, { + asm!(" fmla v0.8h, v0.8h, v0.8h + fmla v1.8h, v1.8h, v1.8h + fmla v2.8h, v2.8h, v2.8h + fmla v3.8h, v3.8h, v3.8h + fmla v4.8h, v4.8h, v4.8h + fmla v5.8h, v5.8h, v5.8h + fmla v6.8h, v6.8h, v6.8h + fmla v7.8h, v7.8h, v7.8h + fmla v8.8h, v8.8h, v8.8h + fmla v9.8h, v9.8h, v9.8h + fmla v10.8h,v10.8h,v10.8h + fmla v11.8h,v11.8h,v11.8h + fmla v12.8h,v12.8h,v12.8h + fmla v13.8h,v13.8h,v13.8h + fmla v14.8h,v14.8h,v14.8h + fmla v15.8h,v15.8h,v15.8h ", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + ) + }); + + s32!("fcvt", 16, { + asm!(" fcvtn v0.4h, v0.4s + fcvtn v1.4h, v1.4s + fcvtn v2.4h, v2.4s + fcvtn v3.4h, v3.4s + fcvtn v4.4h, v4.4s + fcvtn v5.4h, v5.4s + fcvtn v6.4h, v6.4s + fcvtn v7.4h, v7.4s + fcvtn v8.4h, v8.4s + fcvtn v9.4h, v9.4s + fcvtn v10.4h, v10.4s + fcvtn v11.4h, v11.4s + fcvtn v12.4h, v12.4s + fcvtn v13.4h, v13.4s + fcvtn v14.4h, v14.4s + fcvtn v15.4h, v15.4s", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + ) + }); + + s32!("fcvt2", 16, { + asm!(" fcvtn2 v0.8h, v0.4s + fcvtn2 v1.8h, v1.4s + fcvtn2 v2.8h, v2.4s + fcvtn2 v3.8h, v3.4s + fcvtn2 v4.8h, v4.4s + fcvtn2 v5.8h, v5.4s + fcvtn2 v6.8h, v6.4s + fcvtn2 v7.8h, v7.4s + fcvtn2 v8.8h, v8.4s + fcvtn2 v9.8h, v9.4s + fcvtn2 v10.8h, v10.4s + fcvtn2 v11.8h, v11.4s + fcvtn2 v12.8h, v12.4s + fcvtn2 v13.8h, v13.4s + fcvtn2 v14.8h, v14.4s + fcvtn2 v15.8h, v15.4s", + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") 
_, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + ) + }); + + s32!("fmlahp_with_dep", 1, { asm!("fmla v0.8h, v0.8h, v0.8h", out("v0") _) }); + s32!("fcvtn_with_dep", 1, { asm!("fcvtn v0.4h, v0.4s", out("v0") _) }); + s32!("fcvtn2_with_dep", 1, { asm!("fcvtn2 v0.8h, v0.4s", out("v0") _) }); +} + +macro_rules! ksimd { + ($filter: expr, $vector_size: expr, $geo: literal, $n: expr, $path: literal) => { + kloop!($filter, $vector_size, $geo, $n, "arm64simd", $path) + } +} + +macro_rules! kfp16 { + ($filter: expr, $vector_size: expr, $geo: literal, $n: expr, $path: literal) => { + kloop!($filter, $vector_size, $geo, $n, "arm64fp16", $path) + } +} + +macro_rules! kloop { + ($filter: expr, $vector_size: expr, $geo: literal, $n: expr, $dir: literal, $path: literal) => { + let label = $path.split("/").last().unwrap().split_once(".").unwrap().0; + let full_label = format!("{:8} {:40}", $geo, label); + if full_label.contains($filter.unwrap_or("")) { + let time = b2!({ + let mut p = F32; + let mut q = F32; + r4!(asm!(include_str!(concat!("../arm64/", $dir, "/", $path)), + inout("x1") p, inout("x2") q, out("x3") _, + out("x4") _, out("x5") _, out("x6") _, out("x7") _, + out("x8") _, out("x9") _, out("x10") _, out("x11") _, + out("x12") _, out("x13") _, out("x14") _, out("x15") _, + out("x20") _, out("x21") _, out("x22") _, out("x23") _, + out("x24") _, out("x25") _, out("x26") _, out("x27") _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + out("v28") _, out("v29") _, out("v30") _, out("v31") _, + )); + }) / 4.; + println!("{} {:3.0}% ({:0.2}/{} cy)", full_label, $n as 
f64 / $vector_size as f64 / time * 100. * *TICK, time / *TICK, $n as f64 / $vector_size as f64); + } + } +} + +unsafe fn f32_8x8(f: Option<&str>) { + ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli"); + ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli"); + ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli"); + ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli"); + ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli"); + ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli"); + ksimd!(f, 4, "8x8x2xf32", 128, "arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli"); + ksimd!(f, 4, "8x8x2xf32", 128, "arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli"); +} + +unsafe fn f32_12x8(f: Option<&str>) { + ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli"); + ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli"); + ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli"); + ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli"); + ksimd!(f, 4, "12x8x2xf32", 192, "arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli"); +} + +unsafe fn f32_16x4(f: Option<&str>) { + ksimd!(f, 4, "16x4x1xf32", 64, "arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli"); + ksimd!(f, 4, "16x4x1xf32", 64, "arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli"); + ksimd!(f, 4, "16x4x2xf32", 128, "arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli"); +} + +unsafe fn f32_24x4(f: Option<&str>) { + ksimd!(f, 4, "24x4x1xf32", 96, "arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli"); + ksimd!(f, 4, "24x4x1xf32", 96, 
"arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli"); + ksimd!(f, 4, "24x4x1xf32", 96, "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli"); +} + +unsafe fn f32_64x1(f: Option<&str>) { + ksimd!(f, 4, "64x1x1xf32", 64, "arm64simd_mmm_f32_64x1/loop1/naive.tmpli"); + ksimd!(f, 4, "64x1x1xf32", 64, "arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli"); + ksimd!(f, 4, "64x1x2xf32", 128, "arm64simd_mmm_f32_64x1/loop2/naive.tmpli"); + ksimd!(f, 4, "64x1x2xf32", 128, "arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli"); +} + +// RUSTFLAGS="-C target-feature=+fp16" cargo +nightly dinghy -d khadas-paris bench --bench arm64simd +#[target_feature(enable = "fp16")] +unsafe fn f16_16x8(f: Option<&str>) { + kfp16!(f, 8, "16x8x1xf16", 128, "arm64fp16_mmm_f16_16x8/loop1/naive.tmpli"); + kfp16!(f, 8, "16x8x2xf16", 256, "arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli"); + kfp16!(f, 8, "32x4x1xf16", 128, "arm64fp16_mmm_f16_32x4/loop1/naive.tmpli"); + kfp16!(f, 8, "32x4x2xf16", 256, "arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli"); +} + +fn main() { + println!("freq {:.2}GHz\n", 1e-9 / *TICK); + + let filter = std::env::args().skip(1).filter(|a| a != "--bench").next(); + unsafe { + armv8(filter.as_deref()); + if has_asimdhp() { + asimdhp(filter.as_deref()); + } + f32_8x8(filter.as_deref()); + f32_12x8(filter.as_deref()); + f32_16x4(filter.as_deref()); + f32_24x4(filter.as_deref()); + f32_64x1(filter.as_deref()); + f16_16x8(filter.as_deref()); + } +} diff --git a/vendor/tract-linalg-0.22.1/benches/intel.rs b/vendor/tract-linalg-0.22.1/benches/intel.rs new file mode 100644 index 000000000..d45dcf086 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/intel.rs @@ -0,0 +1,200 @@ +#![allow(dead_code)] +use std::time::Instant; + +use tract_data::prelude::*; +use tract_linalg::frame::mmm::*; + + +fn ruin_cache() { + // return; + let _a = (0..1000000).collect::>(); +} + +pub fn reference(mr: usize, k: usize, nr: usize) -> Vec +where + T: Datum + Copy + num_traits::Zero + 
tract_linalg::LADatum, + K: MatMatMulKer, +{ + let mut vi = vec![0.0; k * nr]; + + for m in 0..mr { + for n in 0..nr { + for _ in 0..k { + let a: f32 = 1.0; + let b = 1.0; + let offset = { n + m * nr }; + vi[offset] += a * b; + } + } + } + vi +} + +fn bench_to_nanos< + T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum, + K: MatMatMulKer, +>( + loops: usize, + m: usize, + n: usize, + k: usize, +) -> f64 { + let kernel = K::mmm(); + + let mut a = Tensor::zero_aligned::( + &[(k + K::end_padding_packed_a()) * m], + K::alignment_bytes_packed_a(), + ) + .unwrap(); + + let mut v = a.to_array_view_mut::().unwrap(); + v += 1.0; + let mut b = Tensor::zero_aligned::( + &[(k + K::end_padding_packed_b()) * n], + K::alignment_bytes_packed_b(), + ) + .unwrap(); + + let mut v = b.to_array_view_mut::().unwrap(); + v += 1.0; + let mut c = Tensor::zero::(&[n, m]).unwrap(); + + let ops = unsafe { + [ + FusedSpec::AddMatMul { + k, + a: kernel.a_packed(4, k).wrap(&a.view()), + b: kernel.b_packed(4, k).wrap(&b.view()), + }, + // FusedSpec::AddUnicast(kernel.c_view(1, 0).wrap(&c.view_mut())), + FusedSpec::Store(kernel.c_view(1, 0).wrap(&c.view_mut())), + ] + }; + + let mut values = Vec::with_capacity(loops); + + for _ in 0..loops { + ruin_cache(); + let start = Instant::now(); + unsafe { kernel.run(m, n, &ops).unwrap() }; + values.push(start.elapsed()); + } + + eprintln!("{:?} -> {:?}", values.first().unwrap(), values.last().unwrap()); + + values.sort(); + values[loops / 2].as_nanos() as f64 +} + +fn model>( +) -> (f64, f64) { + let x = 1000; + let zp = bench_to_nanos::(1000, K::mr() * 4, K::nr() * 4, 0); + let y = bench_to_nanos::(1000, K::mr() * 4, K::nr() * 4, x); + let slope = (y - zp) / x as f64; + (slope, zp) +} + +fn as_match_line>() { + let coeffs = model::(); + println!("({:?}, {}, {}) => {} * k + {}", K::name(), K::mr(), K::nr(), (coeffs.0), (coeffs.1),); +} + +fn main() { + + let core_id = core_affinity::get_core_ids().unwrap()[0]; + 
core_affinity::set_for_current(core_id); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + // as_match_line::(); + + // mmv_perf_m(); + mmm_perf_batch_size(); +} + +// for mmv +fn mmv_perf_m() { + use tract_linalg::x86_64_fma::mmm::*; + let core_id = core_affinity::get_core_ids().unwrap()[0]; + core_affinity::set_for_current(core_id); + fn bench>( + m: usize, + ) { + let val = bench_to_nanos::(1000, m, 1, 100) / (m * 100) as f64; + print!("{val}\t"); + } + + print!("N\t"); + print!("fma_mmm_f32_64x1\t"); + print!("avx512_mmm_f32_128x1\t"); + print!("avx512_mmm_f32_16x1\t"); + println!(); + for n in 1..=128 { + eprintln!("{n}"); + print!("{n}\t"); + bench::(n); + bench::(n); + bench::(n); + println!(); + } +} + +// output a csv file with the perf of the kernels wrt batch size +fn mmm_perf_batch_size() { + use tract_linalg::x86_64_fma::mmm::*; + let core_id = core_affinity::get_core_ids().unwrap()[0]; + core_affinity::set_for_current(core_id); + fn bench>( + n: usize, + ) { + let val = + bench_to_nanos::(1000, K::mr() * 4, n, 100) / (K::mr() * 4 * 100 * n) as f64; + print!("{val}\t"); + } + + print!("N\t"); + print!("fma_mmm_f32_8x8\t"); + print!("fma_mmm_f32_16x6\t"); + print!("fma_mmm_f32_16x5\t"); + print!("fma_mmm_f32_24x4\t"); + print!("fma_mmm_f32_32x3\t"); + print!("fma_mmm_f32_40x2\t"); + print!("fma_mmm_f32_64x1\t"); + print!("avx512_mmm_f32_128x1\t"); + print!("avx512_mmm_f32_16x1\t"); + print!("avx512_mmm_f32_16x12\t"); + print!("avx512_mmm_f32_16x8\t"); + print!("avx512_mmm_f32_32x6\t"); + print!("avx512_mmm_f32_32x5\t"); + print!("avx512_mmm_f32_48x4\t"); + print!("avx512_mmm_f32_64x3\t"); + print!("avx512_mmm_f32_80x2\t"); + println!(); + for n in 1..=128 { + eprintln!("{n}"); + print!("{n}\t"); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + 
bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + bench::(n); + println!(); + } +} diff --git a/vendor/tract-linalg-0.22.1/benches/leaky_relu.rs b/vendor/tract-linalg-0.22.1/benches/leaky_relu.rs new file mode 100644 index 000000000..31873fc68 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/leaky_relu.rs @@ -0,0 +1,63 @@ +use criterion::*; +use tract_data::prelude::*; + +use tract_linalg::element_wise::ElementWiseKer; + +fn leaky_relu_f16(c: &mut Criterion) { + let mut group = c.benchmark_group("leaky_relu_f16"); + group.throughput(Throughput::Elements(1024)); + let mut input = unsafe { Tensor::uninitialized_aligned::(&[1024], 16).unwrap() }; + let input = input.as_slice_mut::().unwrap(); + let alpha = f16::from_f32(0.1); + group.bench_function("rust", |b| b.iter(|| rust_fp16(input, alpha))); + group.bench_function("rust_with_f16", |b| b.iter(|| unsafe { rust_with_fp16(input, alpha) })); + group.bench_function("linalg", |b| b.iter(|| linalg16(input, alpha))); + group.bench_function("linalg-asm", |b| b.iter(|| tract_linalg::arm64::arm64fp16_leaky_relu_f16_16n::run(input, alpha))); +} + +#[inline(never)] +fn rust_fp16(input: &mut [f16], alpha: f16) { + for x in input { + *x = if *x > f16::ZERO { *x } else { *x * alpha } + } +} + +#[target_feature(enable = "fp16")] +#[inline(never)] +unsafe fn rust_with_fp16(input: &mut [f16], alpha: f16) { + for x in input { + *x = if *x > f16::ZERO { *x } else { *x * alpha } + } +} + +#[inline(never)] +fn linalg16(input: &mut [f16], alpha: f16) { + (tract_linalg::ops().leaky_relu_f16)().run_with_params(input, alpha).unwrap(); +} + +fn leaky_relu_f32(c: &mut Criterion) { + let mut group = c.benchmark_group("leaky_relu_f32"); + group.throughput(Throughput::Elements(1024)); + let mut input = unsafe { Tensor::uninitialized_aligned::(&[1024], 16).unwrap() }; + let input = input.as_slice_mut::().unwrap(); + let alpha = 0.1f32; + group.bench_function("rust", |b| b.iter(|| 
rust_fp32(input, alpha))); + group.bench_function("linalg", |b| b.iter(|| linalg32(input, alpha))); + group.bench_function("linalg-asm", |b| b.iter(|| tract_linalg::arm64::arm64simd_leaky_relu_f32_8n::run(input, alpha))); +} + +#[inline(never)] +fn rust_fp32(input: &mut [f32], alpha: f32) { + for x in input { + *x = if *x > 0.0 { *x } else { *x * alpha } + } +} + +#[inline(never)] +fn linalg32(input: &mut [f32], alpha: f32) { + (tract_linalg::ops().leaky_relu_f32)().run_with_params(input, alpha).unwrap(); +} + + +criterion_group!(benches, leaky_relu_f32, leaky_relu_f16); +criterion_main!(benches); diff --git a/vendor/tract-linalg-0.22.1/benches/mat_vec.rs b/vendor/tract-linalg-0.22.1/benches/mat_vec.rs new file mode 100644 index 000000000..2c63bfc92 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/mat_vec.rs @@ -0,0 +1,46 @@ +use criterion::*; +use tract_data::internal::*; +use tract_linalg::mmm::{AsInputValue, FusedSpec}; + +use DatumType::F32; + +fn mat_vec_mul(c: &mut Criterion) { + let mut group = c.benchmark_group("mat_vec_mul"); + unsafe { + { + let (m, k) = &(768usize, 256usize); + group.throughput(Throughput::Elements((m * k) as u64)); + group.bench_with_input( + BenchmarkId::from_parameter(format!("{m}x{k}")), + &(m, k), + |be, &(&m, &k)| { + let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(1)).unwrap(); + let packing = &mmm.packings()[0]; + let a = Tensor::zero::(&[m, k]).unwrap(); + let pa = packing.0.prepare_one(&a, 1, 0).unwrap(); + let b = Tensor::zero::(&[k, 1]).unwrap(); + let pb = packing.1.prepare_one(&b, 0, 1).unwrap(); + let mut c = Tensor::zero::(&[m]).unwrap(); + be.iter(move || { + mmm.run( + m, + 1, + &[ + FusedSpec::AddMatMul { + a: AsInputValue::Borrowed(&*pa), + b: AsInputValue::Borrowed(&*pb), + packing: 0, + }, + FusedSpec::Store(mmm.c_view(Some(0), Some(0)).wrap(&c.view_mut())), + ], + ) + }); + }, + ); + } + } + group.finish(); +} + +criterion_group!(benches, mat_vec_mul); +criterion_main!(benches); diff --git 
a/vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs b/vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs new file mode 100644 index 000000000..b350bed87 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs @@ -0,0 +1,37 @@ +use criterion::*; + +mod utils; +use utils::*; + +fn all(c: &mut Criterion) { + // packed_packed: co, ci, n +// direct_conv(c, "asr_2M", 24, 5, 40, 200, 1); // lda + packed_packed(c, "asr_2M", 256, 200, 24); // tdnn1 +// direct_conv(c, "asr_2M", 24, 3, 256, 256, 1); // tdnn2 +// direct_conv(c, "asr_2M", 24, 3, 256, 256, 3); // tdnn3 + packed_packed(c, "asr_2M", 256, 256, 8); // fastlstm1 and 2 (input) x 8 (4 prod x 2 layers) + packed_packed(c, "asr_2M", 256, 128, 1); // fastlstm1 and 2 (hidden) x 64 (4 prod x 2 layers x 8 loops) + packed_packed(c, "asr_2M", 256, 256, 1); // fastlstm1 and 2 (rp) x 16 (2 layers x 8 loops) +// direct_conv(c, "asr_2M", 8, 3, 256, 256, 1); // tdnn4, tdd5 (x2) + packed_packed(c, "asr_2M", 1690, 256, 8); // output + + // 8M + packed_packed(c, "asr_8M", 512, 200, 24); // tdnn1 + packed_packed(c, "asr_8M", 512, 512, 24); // tdnn2 + packed_packed(c, "asr_8M", 512, 256, 1); // fastlstm1 and 2 (four parts, rec mat*vec) + packed_vec(c, "asr_8M", 512, 256, 1); // fastlstm1 and 2 (four parts, rec mat*vec) + + // pseudo 15M + packed_packed(c, "asr_pseudo15M", 768, 200, 24); // tdnn1 + packed_packed(c, "asr_pseudo15M", 768, 2304, 24); // tdnn2 + packed_packed(c, "asr_pseudo15M", 768, 2304, 8); // tdnn3,4,5 + packed_packed(c, "asr_pseudo15M", 768, 768, 8); // fastlstm1 and 2 (four parts, rec mat*mat) + packed_packed(c, "asr_pseudo15M", 768, 384, 1); // fastlstm1 and 2 (four parts, rec mat*vec) + packed_vec(c, "asr_pseudo15M", 768, 384, 1); // fastlstm1 and 2 (four parts, rec mat*vec) + + // 15M + packed_vec(c, "asr_15M", 768, 256, 1); // fastlstm1 and 2 (four parts, rec mat*vec) +} + +criterion_group!(benches, all); +criterion_main!(benches); diff --git 
a/vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs b/vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs new file mode 100644 index 000000000..218d85f0a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs @@ -0,0 +1,45 @@ +extern crate criterion; +use criterion::*; +use tract_data::internal::*; +use tract_linalg::mmm::{AsInputValue, FusedSpec}; + +use DatumType::F32; + +fn mat_mul_smmm(be: &mut criterion::Bencher, &(m, k, n): &(usize, usize, usize)) { + unsafe { + let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(n)).unwrap(); + let a = Tensor::zero::(&[m, k]).unwrap(); + let b = Tensor::zero::(&[k, n]).unwrap(); + let packing = &mmm.packings()[0]; + let pa = packing.0.prepare_one(&a, 1, 0).unwrap(); + let pb = packing.1.prepare_one(&b, 0, 1).unwrap(); + + let mut c = Tensor::zero::(&[m, n]).unwrap(); + be.iter(move || { + mmm.run( + m, + n, + &[ + FusedSpec::AddMatMul { + a: AsInputValue::Borrowed(&*pa), + b: AsInputValue::Borrowed(&*pb), + packing: 0, + }, + FusedSpec::Store(mmm.c_view(Some(0), Some(1)).wrap(&c.view_mut())), + ], + ) + }); + } +} + +fn mat_mul_prepacked(c: &mut Criterion, m: usize, k: usize, n: usize) { + let mut group = c.benchmark_group("mat_mul_prepacked"); + group.bench_function("smmm", |be| mat_mul_smmm(be, &(m, k, n))); +} + +fn s64x288x21609(c: &mut Criterion) { + mat_mul_prepacked(c, 64, 288, 21609) +} + +criterion::criterion_group!(benches, s64x288x21609); +criterion::criterion_main!(benches); diff --git a/vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs b/vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs new file mode 100644 index 000000000..060db5381 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs @@ -0,0 +1,12 @@ +use criterion::*; + +mod utils; +use utils::*; + +fn s16x60x8(c: &mut Criterion) { + packed_packed(c, "wavenet", 32, 32, 8); // postproc + packed_packed(c, "wavenet", 16, 60, 8); +} + +criterion_group!(benches, s16x60x8); 
+criterion_main!(benches); diff --git a/vendor/tract-linalg-0.22.1/benches/sigmoid.rs b/vendor/tract-linalg-0.22.1/benches/sigmoid.rs new file mode 100644 index 000000000..c9868b654 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/sigmoid.rs @@ -0,0 +1,22 @@ +#[macro_use] +extern crate criterion; +extern crate tract_linalg; +use criterion::Criterion; + +fn ssigmoid(c: &mut Criterion, n: usize) { + c.bench_function(&format!("ssigmoid_{n}"), move |be| { + let mut s = (0..n).map(|i| i as f32 / 10.0).collect::>(); + let op = &(tract_linalg::ops().sigmoid_f32)(); + be.iter(|| op.run(&mut s)); + }); +} + +fn bs(c: &mut Criterion) { + ssigmoid(c, 4); + ssigmoid(c, 8); + ssigmoid(c, 128); + ssigmoid(c, 1024); +} + +criterion_group!(benches, bs); +criterion_main!(benches); diff --git a/vendor/tract-linalg-0.22.1/benches/softmax.rs b/vendor/tract-linalg-0.22.1/benches/softmax.rs new file mode 100644 index 000000000..87ce17360 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/softmax.rs @@ -0,0 +1,110 @@ +use criterion::*; +use tract_data::prelude::*; +use tract_linalg::element_wise::ElementWiseKer; +use tract_linalg::generic::reduce::softmax_l2::SSoftMaxL2; +use tract_linalg::reduce::{MapReduceKer, ReduceKer}; + +#[inline(never)] +fn loop1_f32_naive(slice: &mut [f32]) -> f32 { + let mut max = f32::MIN; + for x in &*slice { + if *x > max { + max = *x; + } + } + max +} + +#[inline(never)] +fn loop2_f32(slice: &mut [f32], max: f32) -> f32 { + let mut sum = 0.; + for x in slice.iter_mut() { + *x = (*x - max).exp(); + sum += *x; + } + sum +} + +#[inline(never)] +fn loop3_f32(slice: &mut [f32], sum: f32) { + let recip = sum.recip(); + for x in slice { + *x *= recip; + } +} + +#[inline(never)] +fn rust_f32(slice: &mut [f32]) { + let max = loop1_f32_naive(slice); + let sum = loop2_f32(slice, max); + loop3_f32(slice, sum); +} + +fn softmax_f32(c: &mut Criterion) { + let mut group = c.benchmark_group("softmax_f32"); + group.throughput(Throughput::Elements(1500)); + let mut 
input = unsafe { Tensor::uninitialized_aligned::(&[1500], 16).unwrap() }; + let input = input.as_slice_mut::().unwrap(); + group.bench_function("rust", |b| b.iter(|| rust_f32(input))); + group.bench_function("loop1/naive", |b| b.iter(|| loop1_f32_naive(input))); + group.bench_function("loop1/generic", |b| { + b.iter(|| tract_linalg::generic::reduce::max::SMax4::red().run(input)) + }); + #[cfg(target_arch = "x86_64")] + group.bench_function("loop1/iasm", |b| { + b.iter(|| { + tract_linalg::x86_64_fma::max::x86_64_fma_max_f32_32n::red().run(input).unwrap(); + }) + }); + #[cfg(target_arch = "aarch64")] + group.bench_function("loop1/intr", |b| { + b.iter(|| { + tract_linalg::arm64::arm64simd_max_f32_16n::red().run(input).unwrap(); + }) + }); + group.bench_function("loop2/naive", |b| b.iter(|| loop2_f32(input, 1.0))); + group.bench_function("loop2/generic", |b| { + b.iter(|| SSoftMaxL2::red().run_with_params(input, 10.)) + }); + #[cfg(target_arch = "x86_64")] + group.bench_function("loop2/iasm", |b| { + b.iter(|| { + tract_linalg::x86_64_fma::softmax::x86_64_fma_softmax2_fastcompact_f32_32n::red() + .run_with_params(input, 10.) 
+ .unwrap() + }); + }); + #[cfg(target_arch = "aarch64")] + group.bench_function("loop2/iasm", |b| { + b.iter(|| { + tract_linalg::arm64::arm64simd_softmax2_fastcompact_f32_16n::red() + .run_with_params(input, 0.21) + .unwrap() + }); + }); + group.bench_function("loop3/naive", |b| b.iter(|| loop3_f32(input, 0.21))); + group.bench_function("loop3/generic", |b| { + b.iter(|| { + tract_linalg::generic::by_scalar::SMulByScalar4::ew().run_with_params(input, 0.21) + }) + }); + #[cfg(target_arch = "x86_64")] + group.bench_function("loop3/iasm", |b| { + b.iter(|| { + tract_linalg::x86_64_fma::by_scalar::x86_64_avx_f32_mul_by_scalar_32n::ew() + .run_with_params(input, 0.21) + .unwrap() + }); + }); + #[cfg(target_arch = "aarch64")] + group.bench_function("loop3/iasm", |b| { + b.iter(|| { + tract_linalg::arm64::arm64simd_mul_by_scalar_f32_16n::ew() + .run_with_params(input, 0.21) + .unwrap() + }); + }); +} + +criterion_group!(benches, softmax_f32); +criterion_main!(benches); diff --git a/vendor/tract-linalg-0.22.1/benches/utils.rs b/vendor/tract-linalg-0.22.1/benches/utils.rs new file mode 100644 index 000000000..8d513a5dd --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/utils.rs @@ -0,0 +1,92 @@ +#![allow(dead_code)] +use criterion::*; +use tract_data::internal::*; +use tract_linalg::mmm::{FusedSpec, MMMInputValue, MatMatMul}; + +use tract_linalg::mmm::AsInputValue; +use DatumType::*; + +pub fn packed_packed(c: &mut Criterion, name: &str, m: usize, k: usize, n: usize) { + let mut group = c.benchmark_group(format!("{name}/packed_packed")); + group.throughput(Throughput::Elements((m * k * n) as u64)); + let id = format!("{m}x{k}x{n}"); + group.bench_with_input(BenchmarkId::new("f32/cold", &id), &(F32, m, k, n, true), mat_mat); + group.bench_with_input(BenchmarkId::new("f32/hot", &id), &(F32, m, k, n, false), mat_mat); + group.bench_with_input(BenchmarkId::new("i8/cold", &id), &(I8, m, k, n, true), mat_mat); + group.bench_with_input(BenchmarkId::new("i8/hot", &id), &(I8, 
m, k, n, false), mat_mat); +} + +pub fn packed_vec(c: &mut Criterion, name: &str, m: usize, k: usize, n: usize) { + assert_eq!(n, 1); + let mut group = c.benchmark_group(format!("{name}/packed_vec")); + group.throughput(Throughput::Elements((m * k * n) as u64)); + let id = format!("{m}x{k}x{n}"); + group.bench_with_input(BenchmarkId::new("f32/cold", &id), &(F32, m, k, n, true), mat_mat); + group.bench_with_input(BenchmarkId::new("f32/hot", &id), &(F32, m, k, n, false), mat_mat); + group.bench_with_input(BenchmarkId::new("i8/cold", &id), &(I8, m, k, n, true), mat_mat); + group.bench_with_input(BenchmarkId::new("i8/hot", &id), &(I8, m, k, n, false), mat_mat); +} + +pub fn ruin_cache() { + let _a = (0..1000000).collect::>(); +} + +#[allow(clippy::too_many_arguments)] +unsafe fn run( + m: usize, + _k: usize, + n: usize, + be: &mut Bencher, + mmm: &dyn MatMatMul, + a: &dyn MMMInputValue, + b: &dyn MMMInputValue, + cold: bool, +) { + let mut scratch = unsafe { mmm.allocate_scratch_space() }; + be.iter_custom(move |iters| { + let mut dur = std::time::Duration::default(); + for _ in 0..iters { + if cold { + ruin_cache(); + } + let instant = std::time::Instant::now(); + unsafe { + mmm.run_with_scratch_space( + m, + n, + scratch.as_mut(), + &[FusedSpec::AddMatMul { + a: AsInputValue::Borrowed(a), + b: AsInputValue::Borrowed(b), + packing: 0, + }], + ) + .unwrap() + }; + let time = instant.elapsed(); + dur += time; + } + dur + }); +} + +fn mat_mat(be: &mut Bencher, params: &(DatumType, usize, usize, usize, bool)) { + let (dt, m, k, n, _) = *params; + let mm = tract_linalg::ops().mmm(dt, Some(m), Some(k), Some(n)).unwrap(); + mat_mat_with_mm(be, &*mm, params) +} + +pub fn mat_mat_with_mm( + be: &mut Bencher, + mmm: &dyn MatMatMul, + &(dt, m, k, n, cold): &(DatumType, usize, usize, usize, bool), +) { + let a = Tensor::zero_dt(dt, &[m, k]).unwrap(); + let b = Tensor::zero_dt(dt, &[k, n]).unwrap(); + let packing = &mmm.packings()[0]; + let pa = packing.0.prepare_one(&a, 1, 
0).unwrap(); + let pb = packing.1.prepare_one(&b, 0, 1).unwrap(); + unsafe { + run(m, k, n, be, mmm, &*pa, &*pb, cold); + } +} diff --git a/vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs b/vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs new file mode 100644 index 000000000..8e7309042 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs @@ -0,0 +1,47 @@ +use criterion::measurement::WallTime; +use criterion::*; +use tract_data::internal::*; + +#[allow(dead_code)] +#[path = "../tests/virtual_im2col.rs"] +mod virtual_im2col; +use virtual_im2col::ConvProblem; + +fn conv( + c: &mut BenchmarkGroup, + ci: usize, + h: usize, + w: usize, + co: usize, + kh: usize, + kw: usize, +) { + // CHW HWIO + let input = Tensor::zero::(&[ci, h, w]).unwrap(); + let filters = Tensor::zero::(&[kh, kw, ci, co]).unwrap(); + let mut cv = ConvProblem { input, filters, lazy_im2col: false }; + c.bench_function("eager", |b| { + b.iter(|| { + cv.tract().unwrap(); + }) + }); + cv.lazy_im2col = true; + c.bench_function("lazy", |b| { + b.iter(|| { + cv.tract().unwrap(); + }) + }); +} + +fn ex1(c: &mut Criterion) { + let mut c = c.benchmark_group("ex1"); + conv(&mut c, 32, 256, 256, 32, 3, 3); +} + +fn big(c: &mut Criterion) { + let mut c = c.benchmark_group("big"); + conv(&mut c, 1, 1024, 1024, 99, 3, 3); +} + +criterion_group!(benches, ex1, big); +criterion_main!(benches); diff --git a/vendor/tract-linalg-0.22.1/benches/x86_64.rs b/vendor/tract-linalg-0.22.1/benches/x86_64.rs new file mode 100644 index 000000000..2da68f812 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/benches/x86_64.rs @@ -0,0 +1,242 @@ +#![allow(dead_code, non_upper_case_globals, unused_macros, non_snake_case, unused_assignments)] + +use std::arch::asm; + +mod nano; + +#[repr(C, align(64))] +struct Floats([f32; 256 * 1024 * 64]); +const _F32: Floats = Floats([12.; 256 * 1024 * 64]); +const F32: *const f32 = (&_F32) as *const Floats as *const f32; + +lazy_static::lazy_static! 
{ + static ref TICK: f64 = unsafe { b8192!(asm!("or rax, rax", out("rax") _)) }; +} + +macro_rules! kloop { + ($filter: expr, $geo: literal, $n: expr, $path: literal, $ww: expr, $u: expr, $arch: expr) => { + let label = $path.split("/").last().unwrap().split_once(".").unwrap().0; + let full_label = format!("{:8} {:40}", $geo, label); + let repeats = 32; + let ks = 256; + if full_label.contains($filter.unwrap_or("")) { + let time = b1!({ + + let mut p = F32; + let mut q = F32; + let mut k = ks; + let mut r = repeats; + asm!( + concat!(r#" +2: + mov rax, r9 + mov rcx, r10 + mov r8, r12 +3: + "#, include_str!( concat!("../x86_64/", $arch, "/", $path)), "\n sub r8, ", $u, r#" +jnz 3b + +sub r11, 1 +jnz 2b +"#), + inout("r9") p, inout("r10") q, inout("r12") k, inout("r11") r, out("rax") _, out("rcx") _, + out("r8") _, + out("zmm0") _, out("zmm1") _, out("zmm2") _, out("zmm3") _, + out("zmm4") _, out("zmm5") _, out("zmm6") _, out("zmm7") _, + out("zmm8") _, out("zmm9") _, out("zmm10") _, out("zmm11") _, + out("zmm12") _, out("zmm13") _, out("zmm14") _, out("zmm15") _, + out("zmm20") _, out("zmm21") _, out("zmm22") _, out("zmm23") _, + out("zmm24") _, out("zmm25") _, out("zmm26") _, out("zmm27") _, + out("zmm28") _, out("zmm29") _, out("zmm30") _, out("zmm31") _, + ); + }); + + // We have k=1024 * 64 but some tests step twice per iteration + let iterations = (ks * repeats / $u); + // Those that step twice process twice as many elements per iteration + let elems_per_iteration = $n * $u; + + let time_per_iteration = time / iterations as f64; + + let total_floats = elems_per_iteration * iterations; + let flops = total_floats as f64 / time; + + let total_time_ms = time * 1e6; + let fmas_per_iteration = ($n as f64 / $ww as f64) * $u as f64; + let ticks_per_iteration = time_per_iteration / *TICK; + println!("{} {:3.5} {:3.0}% ({:>5.2 }/{:3 } cy) {:.2} GFLOP/s", full_label, total_time_ms, fmas_per_iteration / ticks_per_iteration * 100., ticks_per_iteration, fmas_per_iteration, 
flops / 1e9 ); + } + }; + + ($filter: expr, $geo: literal, $n: expr, $path: literal, $ww: expr) => { + kloop!($filter, $geo, $n, $path, $ww, 1, "fma") + }; + ($filter: expr, $geo: literal, $n: expr, $path: literal, $ww: expr, $u: expr) => { + kloop!($filter, $geo, $n, $path, $ww, $u, "fma") + }; +} + +unsafe fn packed_packed_1x12(f: Option<&str>) { + println!("-- 1x12 kernels"); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "1x12x1", (16 * 1 * 12), "1x12/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_1x8(f: Option<&str>) { + println!("-- 1x8 kernels"); + kloop!(f, "1x8x1", (8 * 8), "8x8/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "1x8x2", (8 * 8), "8x8/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "1x8x1", (16 * 1 * 8), "8x8/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_2x6(f: Option<&str>) { + println!("-- 2x6 kernels"); + kloop!(f, "2x6x1", (16 * 6), "2x6/packed_packed_loop1/original.tmpli", 8); + kloop!(f, "2x6x2", (16 * 6), "2x6/packed_packed_loop1/original-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "2x6x1", (16 * 2 * 6), "2x6/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "2x6x2", (16 * 2 * 6), "2x6/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_2x5(f: Option<&str>) { + println!("-- 2x5 kernels"); + kloop!(f, "2x5x1", (16 * 5), "2x5/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "2x5x2", (16 * 5), "2x5/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "2x5x1", (32 * 5), "2x5/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "2x5x2", (32 * 5), "2x5/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_3x4(f: Option<&str>) { + 
println!("-- 3x4 kernels"); + kloop!(f, "3x4x1", (24 * 4), "3x4/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "3x4x2", (24 * 4), "3x4/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "3x4x1", (16 * 3 * 4), "3x4/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "3x4x2", (16 * 3 * 4), "3x4/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_4x3(f: Option<&str>) { + println!("-- 4x3 kernels"); + kloop!(f, "4x3x1", (32 * 3), "4x3/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "4x3x2", (32 * 3), "4x3/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "4x3x1", (16 * 4 * 3), "4x3/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "4x3x2", (16 * 4 * 3), "4x3/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_5x2(f: Option<&str>) { + println!("-- 5x2 kernels"); + kloop!(f, "5x2x1", (40 * 2), "5x2/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "5x2x1", (40 * 2), "5x2/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "5x2x1", (16 * 5 * 2), "5x2/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "5x2x2", (16 * 5 * 2), "5x2/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_6x2(f: Option<&str>) { + println!("-- 6x2 kernels"); + kloop!(f, "6x2x1", (48 * 2), "6x2/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "6x2x2", (48 * 2), "6x2/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "6x2x1", (16 * 6 * 2), "6x2/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "6x2x2", (16 * 6 * 2), "6x2/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_8x2(f: Option<&str>) { + 
println!("-- 8x2 kernels"); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "8x2x1", (16 * 8 * 2), "8x2/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_8x1(f: Option<&str>) { + println!("-- 8x1 kernels"); + kloop!(f, "8x1x1", (64 * 1), "8x1/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "8x1x2", (64 * 1), "8x1/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "8x1x1", (16 * 8 * 1), "8x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "8x1x2", (16 * 8 * 1), "8x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_6x1(f: Option<&str>) { + println!("-- 6x1 kernels"); + kloop!(f, "6x1x1", (48 * 1), "6x1/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "6x1x2", (48 * 1), "6x1/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "6x1x1", (16 * 6 * 1), "6x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "6x1x2", (16 * 6 * 1), "6x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_7x1(f: Option<&str>) { + println!("-- 7x1 kernels"); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "7x1x1", (16 * 7 * 1), "7x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "7x1x2", (16 * 7 * 1), "7x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_1x1(f: Option<&str>) { + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "1x1x1", (16 * 1 * 1), "1x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "1x1x2", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll.tmpli", 16, 2, "avx512"); + kloop!(f, "1x1x4", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll-4.tmpli", 16, 4, "avx512"); + kloop!(f, "1x1x8", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll-8.tmpli", 16, 8, 
"avx512"); + kloop!(f, "1x1x16", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll-16.tmpli", 16, 16, "avx512"); + } + println!(); +} + +unsafe fn packed_packed_10x1(f: Option<&str>) { + println!("-- 10x1 kernels"); + kloop!(f, "10x1x1", (80 * 1), "10x1/packed_packed_loop1/avx.tmpli", 8); + kloop!(f, "10x1x2", (80 * 1), "10x1/packed_packed_loop1/avx-unroll.tmpli", 8, 2); + if std::is_x86_feature_detected!("avx512f") { + kloop!(f, "10x1x1", (16 * 10 * 1), "10x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512"); + kloop!(f, "10x1x2", (16 * 10 * 1), "10x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512"); + } + println!(); +} + +fn main() { + let filter = std::env::args().skip(1).find(|a| a != "--bench"); + unsafe { + packed_packed_1x1(filter.as_deref()); + packed_packed_1x12(filter.as_deref()); + packed_packed_1x8(filter.as_deref()); + packed_packed_2x6(filter.as_deref()); + packed_packed_2x5(filter.as_deref()); + packed_packed_3x4(filter.as_deref()); + packed_packed_4x3(filter.as_deref()); + packed_packed_5x2(filter.as_deref()); + packed_packed_6x2(filter.as_deref()); + packed_packed_8x2(filter.as_deref()); + packed_packed_6x1(filter.as_deref()); + packed_packed_7x1(filter.as_deref()); + packed_packed_8x1(filter.as_deref()); + packed_packed_10x1(filter.as_deref()); + } +} diff --git a/vendor/tract-linalg-0.22.1/build.rs b/vendor/tract-linalg-0.22.1/build.rs new file mode 100644 index 000000000..cc7f4449a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/build.rs @@ -0,0 +1,374 @@ +#![allow(clippy::box_default)] + +use liquid_core::Runtime; +use liquid_core::{Display_filter, Filter, FilterReflection, ParseFilter}; +use liquid_core::{Value, ValueView}; + +use std::{env, ffi, fs, path}; + +#[path = "arm64/apple_amx/instructions.rs"] +mod apple_amx_instructions; + +fn var(k: &str) -> String { + env::var(k).unwrap() +} + +fn use_masm() -> bool { + env::var("CARGO_CFG_TARGET_ENV") == Ok("msvc".to_string()) && var("HOST").contains("-windows-") +} + +fn include_amx() 
-> bool { + let arch = var("CARGO_CFG_TARGET_ARCH"); + let os = var("CARGO_CFG_TARGET_OS"); + os == "macos" + || (env::var("CARGO_FEATURE_APPLE_AMX_IOS").is_ok() && os == "ios" && arch == "aarch64") +} + +fn jump_table() -> Vec { + println!("cargo:rerun-if-changed=src/frame/mmm/fuse.rs"); + std::fs::read_to_string("src/frame/mmm/fuse.rs") + .unwrap() + .lines() + .filter(|l| l.contains("// jump_to:")) + .map(|l| l.split("jump_to:").nth(1).unwrap().to_owned()) + .collect() +} + +#[derive(Clone, Debug)] +struct ConfigForHalf { + extra_flags: Vec, + needs_pragma: bool, +} + +impl ConfigForHalf { + fn new(extra_flags: Vec, needs_pragma: bool) -> ConfigForHalf { + ConfigForHalf { extra_flags, needs_pragma } + } + + fn all() -> Vec { + let mut configs = vec![]; + for extra_flags in + [vec![], vec!["-march=armv8.2-a".to_string()], vec!["-mcpu=cortex-a55".to_string()]] + { + for needs_pragma in [false, true] { + configs.push(ConfigForHalf::new(extra_flags.clone(), needs_pragma)) + } + } + configs + } + + fn cc(&self) -> cc::Build { + let mut cc = cc::Build::new(); + for flag in &self.extra_flags { + cc.flag(flag); + } + cc + } + + fn works(&self) -> bool { + let filename = if self.needs_pragma { + "arm64/arm64fp16/dummy_fmla_pragma.S" + } else { + "arm64/arm64fp16/dummy_fmla_no_pragma.S" + }; + self.cc().static_flag(true).file(filename).try_compile("dummy").is_ok() + } + + pub fn probe() -> Option { + Self::all().iter().find(|c| c.works()).cloned() + } +} + +fn main() { + let target = var("TARGET"); + let arch = var("CARGO_CFG_TARGET_ARCH"); + let os = var("CARGO_CFG_TARGET_OS"); + let out_dir = path::PathBuf::from(var("OUT_DIR")); + + let suffix = env!("CARGO_PKG_VERSION").replace(['-', '.'], "_"); + make_extern_kernel_decl_macro(&out_dir, &suffix); + + match arch.as_ref() { + "x86_64" => { + let mut files = preprocess_files("x86_64/fma", &[], &suffix, false); + files.extend(preprocess_files("x86_64/avx512", &[], &suffix, false)); + + if os == "windows" { + if use_masm() 
{ + let mut lib_exe = cc::windows_registry::find(&target, "lib.exe") + .expect("Could not find lib.exe"); + lib_exe + .arg(format!("/out:{}", out_dir.join("x86_64_fma.lib").to_str().unwrap())); + for f in files { + let mut obj = f.clone(); + obj.set_extension("o"); + let mut ml_exe = cc::windows_registry::find(&target, "ml64.exe") + .expect("Could not find ml64.exe"); + if !ml_exe + .arg("/Fo") + .arg(&obj) + .arg("/c") + .arg(&f) + .status() + .unwrap() + .success() + { + for (i, l) in std::fs::read_to_string(&f).unwrap().lines().enumerate() { + println!("{i:8} {l}"); + } + panic!(); + } + lib_exe.arg(obj); + } + assert!(lib_exe.status().unwrap().success()); + println!("cargo:rustc-link-search=native={}", out_dir.to_str().unwrap()); + println!("cargo:rustc-link-lib=static=x86_64_fma"); + } else { + cc::Build::new() + .files(files) + .flag("-mfma") + .flag("-mf16c") + .static_flag(true) + .compile("x86_64_fma"); + + // clang at least (dunno about gcc) outputs .asm files in the + // root directory that we need to clean up so we don't pollute + // the build output/working directory + let _ = fs::remove_file("fma_mmm_f32_16x6.asm"); + let _ = fs::remove_file("fma_mmm_i32_8x8.asm"); + let _ = fs::remove_file("fma_sigmoid_f32.asm"); + let _ = fs::remove_file("fma_tanh_f32.asm"); + } + } else { + cc::Build::new().files(files).flag("-mfma").static_flag(true).compile("x86_64_fma"); + } + } + "arm" | "armv7" => { + let files = preprocess_files("arm32/armvfpv2", &[], &suffix, false); + cc::Build::new() + .files(files) + .flag("-marm") + .flag("-mfpu=vfp") + .static_flag(true) + .compile("armvfpv2"); + let files = preprocess_files( + "arm32/armv7neon", + &[("core", vec!["cortexa7", "cortexa9", "generic"])], + &suffix, + false, + ); + cc::Build::new() + .files(files) + .flag("-marm") + .flag("-mfpu=neon") + .static_flag(true) + .compile("armv7neon"); + } + "aarch64" => { + let files = preprocess_files( + "arm64/arm64simd", + &[("core", vec!["a53", "a55", "gen"])], + &suffix, + 
false, + ); + cc::Build::new().files(files).static_flag(true).compile("arm64simd"); + if include_amx() { + let files = preprocess_files("arm64/apple_amx", &[], &suffix, false); + cc::Build::new().files(files).static_flag(true).compile("appleamx"); + } + if std::env::var("CARGO_FEATURE_NO_FP16").is_err() { + let config = + ConfigForHalf::probe().expect("No configuration found for fp16 support"); + let files = preprocess_files( + "arm64/arm64fp16", + &[("core", vec!["a55", "gen"])], + &suffix, + config.needs_pragma, + ); + config.cc().files(files).static_flag(true).compile("arm64fp16") + } + } + _ => {} + } +} + +type Variant = (&'static str, Vec<&'static str>); + +fn preprocess_files( + input: impl AsRef, + variants: &[Variant], + suffix: &str, + needs_pragma: bool, +) -> Vec { + let out_dir = path::PathBuf::from(var("OUT_DIR")); + let mut files = vec![]; + let dir_entries = { + let mut dir_entries: Vec = + input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect(); + dir_entries.sort_by_key(|a| a.path()); + dir_entries + }; + for f in dir_entries { + if f.path().extension() == Some(ffi::OsStr::new("tmpl")) { + let tmpl_file = f.path().file_name().unwrap().to_str().unwrap().to_owned(); + let concerned_variants: Vec<&Variant> = + variants.iter().filter(|v| tmpl_file.contains(v.0)).collect(); + let expanded_variants = concerned_variants.iter().map(|pair| pair.1.len()).product(); + for v in 0..expanded_variants { + let mut tmpl_file = tmpl_file.clone(); + let mut id = v; + let mut globals = vec![]; + for variable in variants { + let key = variable.0; + let value = variable.1[id % variable.1.len()]; + globals.push((key, value)); + tmpl_file = tmpl_file.replace(key, value); + id /= variable.1.len(); + } + let mut file = out_dir.join(tmpl_file); + file.set_extension("S"); + preprocess_file(f.path(), &file, &globals, suffix, needs_pragma); + files.push(file); + } + } + } + files +} + +fn strip_comments(s: String, msvc: bool) -> String { + if msvc { + 
s.lines().map(|line| line.replace("//", ";")).collect::>().join("\n") + } else { + s + } +} + +fn preprocess_file( + template: impl AsRef, + output: impl AsRef, + variants: &[(&'static str, &'static str)], + suffix: &str, + needs_pragma: bool, +) { + println!("cargo:rerun-if-changed={}", template.as_ref().to_string_lossy()); + let family = var("CARGO_CFG_TARGET_FAMILY"); + let os = var("CARGO_CFG_TARGET_OS"); + + // We also check to see if we're on a windows host, if we aren't, we won't be + // able to use the Microsoft assemblers, + let msvc = use_masm(); + println!("cargo:rerun-if-changed={}", template.as_ref().to_string_lossy()); + let mut input = fs::read_to_string(&template).unwrap(); + input = strip_comments(input, msvc); + let l = if os == "macos" { + "L" + } else if family == "windows" { + "" + } else { + ".L" + } + .to_owned(); + let long = if msvc { "dd" } else { ".long" }; + let g = if os == "macos" || os == "ios" { "_" } else { "" }; + // note: use .align with bytes instead of p2align since they both use direct bytes. 
+ let align = if msvc { "align" } else { ".align" }; + let mut globals = liquid::object!({ + "msvc": msvc, + "needs_pragma": needs_pragma, + "family": family, + "os": os, + "L": l, + "G": g, + "suffix": suffix, + "long": long, + "jump_table": jump_table(), + "align": align, + "offset": if msvc { "offset" } else { "rip + "}, + }); + for (k, v) in variants { + globals.insert(k.to_string().into(), liquid::model::Value::scalar(*v)); + } + let partials = load_partials(template.as_ref().parent().unwrap(), msvc); + let mut parser = liquid::ParserBuilder::with_stdlib() + .partials(liquid::partials::LazyCompiler::new(partials)) + .filter(F16); + if include_amx() { + parser = apple_amx_instructions::register(parser); + globals.extend(apple_amx_instructions::globals()); + } + if let Err(e) = parser + .build() + .and_then(|p| p.parse(&input)) + .and_then(|r| r.render_to(&mut fs::File::create(&output).unwrap(), &globals)) + { + eprintln!("Processing {}", template.as_ref().to_string_lossy()); + eprintln!("{e}"); + panic!() + } +} + +fn load_partials(p: &path::Path, msvc: bool) -> liquid::partials::InMemorySource { + let mut mem = liquid::partials::InMemorySource::new(); + for f in walkdir::WalkDir::new(p) { + let f = f.unwrap(); + if f.path().is_dir() { + continue; + } + + let ext = f.path().extension().map(|s| s.to_string_lossy()).unwrap_or("".into()); + let text = std::fs::read_to_string(f.path()).unwrap_or_else(|_| panic!("file {f:?}")); + let text = match ext.as_ref() { + "tmpli" => Some(text.replace("{{", "{").replace("}}", "}")), + "tmpliq" => Some(text), + _ => None, + }; + if let Some(text) = text { + let text = strip_comments(text, msvc); + let key = + f.path().strip_prefix(p).unwrap().to_str().unwrap().to_owned().replace('\\', "/"); + println!("cargo:rerun-if-changed={}", f.path().to_string_lossy().replace('\\', "/")); + + mem.add(key, text); + } + } + mem +} + +fn make_extern_kernel_decl_macro(out_dir: &path::Path, suffix: &str) { + let macro_decl = r#" + macro_rules! 
extern_kernel { + (fn $name: ident($($par_name:ident : $par_type: ty ),*) -> $rv: ty) => { + paste! { + unsafe extern "C" { pub fn [<$name _ _suffix>]($(par_name: $par_type),*) -> $rv; } + pub use [<$name _ _suffix>] as $name; + } + } + }"# + .replace("_suffix", suffix); + std::fs::write(out_dir.join("extern_kernel_macro.rs"), macro_decl).unwrap(); +} + +#[derive(Clone, ParseFilter, FilterReflection)] +#[filter( + name = "float16", + description = "Write a float16 constant with the .float16 directive in gcc, or as short in clang", + parsed(F16Filter) +)] +pub struct F16; + +#[derive(Debug, Default, Display_filter)] +#[name = "float16"] +struct F16Filter; + +impl Filter for F16Filter { + fn evaluate( + &self, + input: &dyn ValueView, + _runtime: &dyn Runtime, + ) -> liquid_core::Result { + let input: f32 = input.as_scalar().unwrap().to_float().unwrap() as f32; + let value = half::f16::from_f32(input); + let bits = value.to_bits(); + Ok(format!(".short {bits}").to_value()) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/arm32.rs b/vendor/tract-linalg-0.22.1/src/arm32.rs new file mode 100644 index 000000000..29c0a94a4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32.rs @@ -0,0 +1,101 @@ +use std::{env, fs}; +pub mod armv7neon; +mod armvfpv2; +mod cortex_a7; +mod cortex_a9; +use armv7neon::*; + +use crate::frame::element_wise::ElementWiseKer; + +use crate::Ops; + +fn has_neon_cpuinfo() -> std::io::Result { + let cpu_info = fs::read_to_string("/proc/cpuinfo")?; + let neon = cpu_info.split("\n").any(|line| { + line.starts_with("Features") && (line.contains("neon") || line.contains("asimd")) + }); + Ok(neon) +} + +fn cpu_part() -> Option { + fs::read_to_string("/proc/cpuinfo").ok().and_then(|cpuinfo| { + cpuinfo + .lines() + .find(|line| line.starts_with("CPU part")) + .and_then(|s| s.trim().split_whitespace().last()) + .and_then(|s| s.strip_prefix("0x")) + .and_then(|s| usize::from_str_radix(s, 16).ok()) + }) +} + +fn has_neon() -> bool { + if let Ok(v) = 
env::var("TRACT_CPU_ARM32_NEON") { + return v == "true" || v == "1"; + } + has_neon_cpuinfo().unwrap_or(false) +} + +pub fn plug(ops: &mut Ops) { + if has_neon() { + log::info!("armv7neon activated (smmm, ssigmoid), stanh)"); + armv7neon::plug(ops); + + let cpu = cpu_part().unwrap_or(0); + + fn prefer_8x4(_m: Option, _k: Option, n: Option) -> bool { + n.map(|n| n % 4 == 0 && n % 6 != 0 && n <= 12).unwrap_or(false) + } + + let cost_managed_impls = vec![ + armv7neon_mmm_f32_8x4_cortexa7.mmm(), + armv7neon_mmm_f32_8x6_cortexa7.mmm(), + armv7neon_mmm_f32_8x4_cortexa9.mmm(), + armv7neon_mmm_f32_8x6_cortexa9.mmm(), + armv7neon_mmm_f32_8x4_generic.mmm(), + armv7neon_mmm_f32_8x6_generic.mmm(), + crate::generic::mmm::generic_f32_4x4.mmm(), + ]; + ops.mmv_f32 = match cpu { + 0xc07 => Box::new(|_, _| armv7neon::armv7neon_mmm_f32_32x1_cortexa7.mmm()), + 0xc09 => Box::new(|_, _| armv7neon::armv7neon_mmm_f32_32x1_cortexa9.mmm()), + _ => Box::new(|_, _| armv7neon::armv7neon_mmm_f32_32x1_generic.mmm()), + }; + + ops.mmm_f32 = match cpu { + 0xc07 => { + let model = cortex_a7::model(); + Box::new(move |m, k, n| model.pick(&cost_managed_impls, m, k, n)) + } + 0xc09 => { + let model = cortex_a9::model(); + Box::new(move |m, k, n| model.pick(&cost_managed_impls, m, k, n)) + } + _ => Box::new(|m, k, n| { + if prefer_8x4(m, k, n) { + armv7neon::armv7neon_mmm_f32_8x4_generic.mmm() + } else { + armv7neon::armv7neon_mmm_f32_8x6_generic.mmm() + } + }), + }; + ops.qmmm_i32 = Box::new(|_, _, _| armv7neon::armv7neon_mmm_i32_8x4.mmm()); + ops.qmmv_i32 = Box::new(|_, _| armv7neon::armv7neon_mmm_i32_32x1.mmm()); + ops.sigmoid_f32 = Box::new(|| armv7neon_sigmoid_f32_4n::ew()); + ops.tanh_f32 = Box::new(|| armv7neon_tanh_f32_4n::ew()); + } else { + armvfpv2::plug(ops); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn may_have_neon() { + println!("Has neon ? 
{:?}", has_neon()); + if let Ok(neon) = env::var("TRACT_CPU_EXPECT_ARM32_NEON") { + assert_eq!(neon == "true", has_neon()); + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs b/vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs new file mode 100644 index 000000000..3ca3ac441 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs @@ -0,0 +1,46 @@ +use crate::frame::mmm::ImplementationQuality::ManuallyOptimized; +use crate::pack::PackedFormat; +use crate::Ops; + +const NEON: fn() -> bool = || crate::arm32::has_neon(); + +MMMExternKernel!(armv7neon_mmm_f32_8x4_cortexa7 ( 8, 4)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_8x4_cortexa9 ( 8, 4)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_8x4_generic ( 8, 4)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_8x6_cortexa7 ( 8, 6)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_8x6_cortexa9 ( 8, 6)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_8x6_generic ( 8, 6)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_8x1_generic ( 8, 1)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_32x1_cortexa7(32, 1)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_32x1_cortexa9(32, 1)@(16, 4) where(NEON) quality(ManuallyOptimized)); +MMMExternKernel!(armv7neon_mmm_f32_32x1_generic (32, 1)@(16, 4) where(NEON) quality(ManuallyOptimized)); + +MMMExternKernel!(armv7neon_mmm_i32_8x4(8, 4)@(32, 4) where(NEON) + packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 8, 32), PackedFormat::new(DatumType::I8, 4, 32)); + quality(ManuallyOptimized) + store(i8) +); + +MMMExternKernel!(armv7neon_mmm_i32_32x1(32, 1)@(32, 4) where(NEON) + packing[1] = i8i8 => |k| 
k.with_packing(PackedFormat::new(DatumType::I8, 32, 32), PackedFormat::new(DatumType::I8, 1, 4)); + quality(ManuallyOptimized) + store(i8) +); + +pub fn plug(ops: &mut Ops) { + ops.mmm_impls.extend_from_slice(&[ + armv7neon_mmm_f32_8x4_cortexa7.mmm(), + armv7neon_mmm_f32_8x4_cortexa9.mmm(), + armv7neon_mmm_f32_8x4_generic.mmm(), + armv7neon_mmm_f32_8x6_cortexa7.mmm(), + armv7neon_mmm_f32_8x6_cortexa9.mmm(), + armv7neon_mmm_f32_8x6_generic.mmm(), + armv7neon_mmm_f32_8x1_generic.mmm(), + armv7neon_mmm_f32_32x1_cortexa7.mmm(), + armv7neon_mmm_f32_32x1_cortexa9.mmm(), + armv7neon_mmm_f32_32x1_generic.mmm(), + ]); +} + +sigmoid_impl!(f32, armv7neon_sigmoid_f32_4n, 4, 4, crate::arm32::has_neon()); +tanh_impl!(f32, armv7neon_tanh_f32_4n, 4, 4, crate::arm32::has_neon()); diff --git a/vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs b/vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs new file mode 100644 index 000000000..46e75f774 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs @@ -0,0 +1,11 @@ +use crate::frame::mmm::ImplementationQuality::ManuallyOptimized; +use crate::frame::mmm::*; +use crate::Ops; + +MMMExternKernel!(armvfpv2_mmm_f32_4x4(4, 4)@(4, 4) quality(ManuallyOptimized)); + +pub fn plug(ops: &mut Ops) { + log::info!("armvfpv2 activated for smmm"); + ops.mmm_f32 = Box::new(|_, _, _| armvfpv2_mmm_f32_4x4.mmm()); + ops.mmm_impls.push(armvfpv2_mmm_f32_4x4.mmm()); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs new file mode 100644 index 000000000..f50aabdcc --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs @@ -0,0 +1,16 @@ +use crate::frame::mmm::CostModel; + pub fn model() -> CostModel<'static> { + CostModel { + big_product_mkn_threshold: 4193728.0, + big_product_kernel_choice: "armv7neon_mmm_f32_8x6_cortexa7", + kernels: &["armv7neon_mmm_f32_8x4_cortexa7", "armv7neon_mmm_f32_8x4_cortexa9", "armv7neon_mmm_f32_8x4_generic", "armv7neon_mmm_f32_8x6_cortexa7", 
"armv7neon_mmm_f32_8x6_cortexa9", "armv7neon_mmm_f32_8x6_generic", "generic_f32_4x4"], + mrs: &[4, 8], + nrs: &[4, 6], + feat_norm_mean: &[4.589878771602424, 4.5739692460187005, 4.598167981532298, 13.762015999153403, 1.5038983903420524, 0.749874245472837, 3.465165995975855, 0.8777665995975855, 1.5022635814889336, 0.7570422535211268, 2.482142857142857, 0.8333752515090543], + feat_norm_stddev: &[1.2587312982588519, 1.2603116830524392, 1.2581181647300588, 1.3169322340874257, 1.1192637768418767, 0.43308528195884044, 2.2762097127791114, 0.32755518043295856, 1.1069539235554247, 0.42886977033219037, 1.7067987601825914, 0.37264049924995035], + w1: &[0.06765510141849518, 0.024555781856179237, -0.8821254968643188, -0.004870870150625706, -0.10525479167699814, 0.1827959418296814, 0.1633400171995163, -0.2377464473247528, -0.17880690097808838, 0.19097138941287994, 0.04676022008061409, -0.11329511553049088, 0.4089120030403137, -0.3100685477256775, -0.1652061492204666, -0.19124962389469147, -0.03810987249016762, -0.00785011239349842, 0.09714752435684204, -0.11142419278621674, 0.19261880218982697, -0.2893339991569519, -0.19540216028690338, 0.39759594202041626, -0.00619965186342597, -0.8473111391067505, 0.343344122171402, -0.12575943768024445, 0.029266485944390297, -0.02900734543800354, -0.019343264400959015, 0.08306540548801422, -0.1927606761455536, 0.23312175273895264, 0.2576882541179657, -0.35881471633911133, -0.27300119400024414, -0.2995607852935791, -0.7934547662734985, -0.9349930286407471, -0.011614155024290085, -0.12521372735500336, 0.011371670290827751, 0.05779163911938667, 0.17875070869922638, -0.23169392347335815, -0.09749509394168854, 0.07436174154281616, 0.24035069346427917, -0.1262669861316681, 0.3874961733818054, -0.11149000376462936, 0.03639678284525871, 0.17740628123283386, 0.03768332302570343, -0.20480288565158844, -0.1955408751964569, 0.44144806265830994, 0.3628064692020416, -0.2537013292312622, 0.019405143335461617, 0.06186319515109062, 0.5196826457977295, 
0.3010406494140625, 0.04013144597411156, 0.03517461195588112, -0.037290964275598526, 0.009919736534357071, -0.3135205805301666, 0.4654330909252167, 0.46720823645591736, 0.29665476083755493, 0.09099660068750381, -0.7376689314842224, -0.07840575277805328, -0.5192644000053406, 0.019796665757894516, -0.021734869107604027, 0.13953897356987, -0.04154204577207565, 0.10942933708429337, -0.13621817529201508, -0.04218055680394173, 0.09188657253980637, -0.16021296381950378, -0.19393481314182281, 0.3737955689430237, 0.08288388699293137, -0.08280416578054428, -0.13087297976016998, -0.09470323473215103, 0.2779513895511627, 0.03663017228245735, 0.36601993441581726, 0.8102841377258301, 0.6883901953697205, -0.33066609501838684, -0.34960171580314636, 0.923985481262207, 0.5853908061981201, 0.07039576023817062, -0.11843020468950272, -0.06797836720943451, 0.0974433571100235, -0.4707315266132355, 0.37827417254447937, 0.15521520376205444, -0.7403592467308044, -0.25005313754081726, 0.596679151058197, -0.7277861833572388, -0.6915309429168701, -0.0050544412806630135, -0.12311484664678574, 0.04149714484810829, 0.05289606750011444, 0.2448417991399765, -0.47261708974838257, -0.3535511791706085, 0.4614925682544708, 0.9230178594589233, -0.5351396799087524, 0.8224894404411316, 0.37244901061058044, -0.08826857805252075, -0.0452042818069458, 0.0035054143518209457, 0.09203510731458664, 0.08918709307909012, -0.0694250762462616, -0.053435735404491425, 0.1012222170829773, 0.3401939570903778, -0.38458573818206787, 0.3040490746498108, 0.7614821791648865, -0.17064380645751953, 0.22403603792190552, 0.08646601438522339, -0.08289062976837158, -0.20126193761825562, 0.2795524299144745, 0.13253425061702728, -0.07332615554332733, 0.2151418924331665, 0.16798575222492218, 0.003749655559659004, 0.2437056005001068, -0.09098415076732635, 0.18923071026802063, 0.07854695618152618, -0.25417080521583557, 0.15693743526935577, -0.30657434463500977, -0.19041943550109863, 0.26519766449928284, 0.24278832972049713, 
-0.18357035517692566, -0.015992645174264908, 0.43973660469055176, 0.02785446122288704, 0.3032245934009552, -0.021606506779789925, -0.2682349383831024, -0.10395143181085587, 0.050348248332738876, 0.12892353534698486, -0.10498340427875519, -0.027477847412228584, 0.09730125963687897, -0.16150422394275665, -0.21831916272640228, 0.10376061499118805, -0.25544440746307373, 0.031593386083841324, 0.11986788362264633, 0.22690074145793915, -0.3509098291397095, -0.1881190538406372, -0.04210145026445389, 0.6883101463317871, -0.07829979062080383, 0.4657376706600189, 0.9263871908187866, 0.08322961628437042, 0.04429711028933525, -0.08905605971813202, -0.06788893789052963, -0.056182388216257095, -0.04881853610277176, -0.04854113608598709, 0.15449045598506927, 0.32911357283592224, -0.5772383809089661, -0.00027374469209462404, -0.2995521128177643, -0.027322502806782722, 0.5023694038391113, 0.045783523470163345, -0.4035968780517578, 0.053967904299497604, 0.00014662329340353608, 0.021607715636491776, -0.028252260759472847, -0.05918470770120621, -0.1273883581161499, 0.0679078996181488, 0.25051605701446533, -0.0745333656668663, 0.18680104613304138, -0.12048312276601791, 0.013110226020216942, -0.07659415900707245, 0.2906968295574188, 0.3136366307735443, -0.47699007391929626, 0.02583535574376583, -0.15701107680797577, 0.045304182916879654, 0.23456838726997375, -0.06186807528138161, 0.3926846981048584, -0.13252438604831696, -0.16362214088439941, 0.013557562604546547, -0.09991434961557388, 0.09150815010070801, -0.006477471441030502, 0.2915862202644348, 0.5867642164230347, -0.37984445691108704, 0.033169880509376526, 0.024414243176579475, -0.0384003147482872, -0.06395144015550613, 0.07380940765142441, -0.025898484513163567, 0.03951931372284889, -0.2343142330646515, 0.27318838238716125, 0.1105947494506836, 0.290696382522583, -0.17851489782333374, -0.17699271440505981, -0.210996612906456, -0.10575137287378311, 0.15886521339416504, 0.10631759464740753, 0.22946283221244812, -0.3170112073421478, 
-0.49773311614990234, -0.10753292590379715, -0.1114523783326149, -0.10953730344772339, 0.4754663109779358, 0.20793643593788147, 0.021392812952399254, -0.0691467821598053, 0.03368104621767998, -0.017844771966338158, 0.1657843142747879, -0.5556477904319763, -1.108074426651001, -0.822117805480957, -0.06053074076771736, -0.4072379469871521, 0.09109722077846527, -0.5544739961624146, -0.13978064060211182, -0.36262163519859314, 0.20034632086753845, 0.050625383853912354, 0.1497042030096054, -0.18745489418506622, 0.0894727036356926, 0.00417149206623435, 0.2228451371192932, 0.00852279644459486, -0.028313757851719856, 0.04104698821902275, -0.0874263271689415, 0.19788521528244019, -0.019343160092830658, -0.03962515667080879, 0.2092486023902893, -0.44425246119499207, -0.48542261123657227, -0.04222029820084572, 0.7616084218025208, 0.512810468673706, -0.17871123552322388, 0.5459727644920349, -0.13069608807563782, 0.09155352413654327, 0.11548610031604767, -0.15368784964084625, 0.038799818605184555, -0.049028217792510986, -0.03215758875012398, -0.050522346049547195, 0.1663637012243271, -0.15482299029827118, -0.9425870180130005, -0.7017998695373535, 0.04315050691366196, -0.019968662410974503, 0.03749818727374077, -0.07611791789531708, 0.32011789083480835, -0.6925904750823975, -0.49334919452667236, 0.23214411735534668, 1.1447347402572632, -0.6757001876831055, 0.7940422296524048, 0.40169182419776917, -0.018513813614845276, 0.048821814358234406, -0.016693273559212685, 0.008068449795246124, 0.04566117003560066, -0.09829569607973099, -0.026971371844410896, 0.05381541699171066, -0.3659301698207855, 0.3473235070705414, 0.14521746337413788, 0.11228122562170029, -0.041056130081415176, -0.11228874325752258, 0.006667478010058403, 0.15931302309036255, -0.30010080337524414, 0.3464723229408264, 0.4476386308670044, -0.3498152494430542, 0.2616507112979889, -0.19995814561843872, 0.10946320742368698, 0.4034257233142853, -0.08651446551084518, 0.018647747114300728, 0.11572548002004623, 
-0.100877545773983, -0.16341210901737213, 0.2377898246049881, 0.3417612910270691, -0.49084869027137756, -0.02805873565375805, -0.09811390936374664, 0.17161016166210175, 0.3627470135688782, -0.08954513072967529, 0.06629404425621033, 0.012786897830665112, 0.01578289456665516, -0.32630467414855957, 0.4854920506477356, 0.12709765136241913, -0.4909423291683197, -0.3745254874229431, -0.6513142585754395, -0.040075208991765976, -0.569782018661499, -0.009953420609235764, 0.04735071584582329, 0.0230120699852705, -0.07381311058998108, -0.06293600797653198, 0.20196016132831573, 0.26551517844200134, -0.42071688175201416, 0.28809165954589844, 0.19747501611709595, -0.5686206221580505, -0.5285986661911011, 0.02009684592485428, 0.11322621256113052, -0.1082596555352211, -0.0856761634349823, -0.04493662342429161, -0.6179490089416504, -0.1442672610282898, 0.028762176632881165, 0.12426868081092834, -0.5771384835243225, 0.1608373522758484, 0.004147801548242569, -0.047590240836143494, 0.10347189754247665, 0.11780986934900284, -0.08490656316280365, -0.0746934711933136, 0.15699702501296997, 0.1298881322145462, -0.14411042630672455, -0.08601037412881851, 0.2997709810733795, -0.05418943241238594, -0.1772651970386505, 0.04576871916651726, -0.13510753214359283, -0.057203926146030426, 0.18647770583629608, 0.0055348677560687065, -0.12238732725381851, -0.11199415475130081, 0.43077343702316284, 0.1349855363368988, 0.21327465772628784, 0.05924845486879349, 0.12549948692321777, -0.060076650232076645, 0.23921678960323334, 0.02152605727314949, -0.1352948695421219, 0.09325127303600311, -0.14411674439907074, 0.010495728813111782, 0.11577513813972473, -0.07580242305994034, 0.42641204595565796, -0.5557231903076172, -0.12044595927000046, 0.024152765050530434, -0.14175696671009064, 0.024960221722722054, 0.10017693042755127, -0.07402117550373077, 0.09156208485364914, 0.455565482378006, 0.424320250749588, -0.07668061554431915, 0.10318724811077118, -0.32521969079971313, -0.2653461694717407, 
-0.03919212520122528, 0.12909358739852905, -0.17091549932956696, 0.07353391498327255, 0.11510979384183884, -0.23758216202259064, -0.3059186339378357, -0.046047650277614594, 0.17527209222316742, 0.19020265340805054, -0.20766229927539825, -0.23476286232471466, -0.14011070132255554, 0.1085173636674881, -0.020777594298124313, 0.014691418968141079, 0.21648286283016205, -0.21576255559921265, 0.28203028440475464, 0.6320008635520935, -0.23609709739685059, 0.16072526574134827, 0.30149686336517334, -0.05675647035241127, -0.018186205998063087, -0.1844293773174286, 0.13510139286518097, 0.05780869722366333, 0.07202577590942383, 0.07459436357021332, 0.18700383603572845, -0.09449177235364914, 0.057188909500837326, 0.21453143656253815, -0.30002379417419434, -0.12217795103788376, 0.03723505884408951, -0.18360234797000885, -0.029992947354912758, 0.10999765247106552, 0.09575961530208588, -0.36028456687927246, -0.4311397075653076, 0.5812231302261353], + b1: &[0.3801889419555664, -0.5001883506774902, 0.19484910368919373, 0.6488791704177856, 0.38620173931121826, 0.8780303597450256, -0.1126403734087944, 0.021730314940214157, -0.7806469202041626, -0.04312174394726753, 0.3102167546749115, 0.9241658449172974, 0.8900863528251648, -0.2938256561756134, -0.5012822151184082, -0.00329477502964437, 0.5169500708580017, 0.4563848376274109, -0.4903448224067688, 0.27919942140579224, -0.4288303554058075, -0.1836952418088913, -0.09118890762329102, 0.5528226494789124, -0.19896377623081207, 0.33588215708732605, 0.07895006239414215, 0.07812929153442383, 0.6203332543373108, 0.8427650332450867, -0.684628427028656, 0.5408275723457336, -0.5548633933067322, -0.49557214975357056, 0.7953769564628601, -0.4109633266925812, -0.6270897388458252, -0.43285393714904785, -0.7562689781188965, -0.7167727947235107], + w2: &[0.15592391788959503, 0.25119924545288086, -0.499594122171402, -0.5441639423370361, -0.11186911165714264, -0.6334478855133057, 0.28880706429481506, -0.592946469783783, 0.7188563942909241, 
-0.49322614073753357, -0.1398385912179947, -0.1868145614862442, 0.9288992881774902, -0.07525540888309479, 0.2288437783718109, 0.09932874143123627, 0.2782813012599945, -0.12644614279270172, -0.14151062071323395, 0.38845404982566833, 0.2691279947757721, -0.9148958921432495, 0.19230225682258606, 0.6098687052726746, -0.24782557785511017, -0.6989489197731018, -0.30721813440322876, -0.4890380799770355, -0.43724432587623596, -0.38428765535354614, -0.6491377353668213, -0.28134995698928833, -0.36228886246681213, -0.05963568389415741, 0.5086851119995117, 0.4664144814014435, 0.3797634541988373, 0.5596290826797485, -0.1977449357509613, 0.6540879607200623, -0.24533972144126892, 0.6865915656089783, -0.18364377319812775, 0.0013501447392627597, -0.4037604331970215, -0.287411093711853, -0.43570032715797424, -0.4085054099559784, 0.7341827750205994, -0.29973891377449036, -0.18240050971508026, -0.23446109890937805, 0.7225431799888611, 0.008502814918756485, 0.04582007974386215, 0.03352205455303192, 0.12457727640867233, -0.2019437849521637, -0.1299249827861786, -0.09946829080581665, 0.40665051341056824, -0.6841736435890198, -0.523845911026001, 0.21656402945518494, 0.6046024560928345, -0.6393186450004578, -0.3965637981891632, -0.7872777581214905, -0.13687947392463684, -0.19312888383865356, -0.5453231930732727, -0.21912647783756256, 0.011589044705033302, 0.2665385603904724, 0.3249806761741638, 0.293254017829895, 0.1047254130244255, 0.4246895909309387, -0.0033608688972890377, 0.4066942632198334, 0.06138676777482033, 0.382074236869812, 0.0787188857793808, -0.28631800413131714, -0.3500039279460907, -0.1490340679883957, -0.14991725981235504, -0.180477574467659, 0.15140952169895172, -0.35168370604515076, 0.38904908299446106, -0.11262823641300201, -0.18404939770698547, 0.5045862197875977, 0.23344825208187103, 0.6740546226501465, -0.054060351103544235, -0.47260594367980957, 0.287933886051178, 0.28975099325180054, 0.2366262525320053, -0.1751112937927246, -0.15358465909957886, 
-0.062381260097026825, 0.45881521701812744, -0.12647950649261475, 0.45258036255836487, -0.21084383130073547, -0.15994171798229218, -0.4229416847229004, -0.18642400205135345, -0.2506699860095978, 0.20604389905929565, 0.16662882268428802, -0.23073841631412506, 0.045810505747795105, 0.33520498871803284, 0.37685254216194153, 0.11563336104154587, 0.22259201109409332, -0.010484708473086357, -0.45855188369750977, 0.24794596433639526, 0.33667632937431335, 0.20378778874874115, 0.4198003113269806, 0.23384596407413483, 0.23601709306240082, -0.509751558303833, 0.5694931149482727, -0.08933047205209732, 0.037133198231458664, 0.20635388791561127, -0.2857131361961365, -0.4278101921081543, -0.26602792739868164, 0.1998632550239563, 0.4324374794960022, -0.13389578461647034, 0.11837134510278702, -0.17028754949569702, 0.37928706407546997, 0.10062910616397858, -0.04736608266830444, -0.04692180082201958, 0.6633663773536682, -0.3517492711544037, 0.2055688351392746, 0.44142597913742065, 0.42460545897483826, 0.4567111134529114, 0.3061029016971588, -0.16390416026115417, -0.3541538417339325, 0.2544074058532715, -0.18162837624549866, -0.21904821693897247, -0.2520917057991028, -0.07266020774841309, -0.23432950675487518, -0.1989256739616394, 0.09460597485303879, -0.24563294649124146, 0.9719013571739197, 0.2578149139881134, 0.26680076122283936, -0.39480605721473694, 0.22382304072380066, -0.4284250736236572, 0.4294125437736511, -0.04923247918486595, 0.5011574625968933, 0.1887599676847458, -0.02984841726720333, -0.16428305208683014, -0.33957910537719727, -0.16184143722057343, 0.37313663959503174, -0.11775537580251694, -0.34507161378860474, -0.24848994612693787, 0.3492432236671448, -0.2122095823287964, -0.022055158391594887, 0.07298140972852707, 0.36230477690696716, -0.2514148950576782, 0.11675992608070374, 0.4010731875896454, 0.31790846586227417, 0.0585796944797039, 0.30878275632858276, 0.5536429286003113, -0.061644136905670166, -0.06381722539663315, -0.1873038411140442, -0.24746698141098022, 
-0.3139619529247284, -0.19278131425380707, -0.48264867067337036, 0.5122742056846619, 0.09536745399236679, 0.17870695888996124, 0.18145892024040222, 0.2471739798784256, -0.16399677097797394, -0.18874068558216095, 0.21305255591869354, -0.6930050253868103, -0.4031701982021332, 0.5250658392906189, 0.4295860230922699, -0.464653879404068, -0.026941847056150436, -0.08213993161916733, 0.34638163447380066, -0.15401627123355865, 0.021148433908820152, 0.19726167619228363, -0.25100240111351013, 3.085673233726993e-05, 0.16563303768634796, -0.008333534933626652, -0.02890022285282612, -0.284770667552948, 0.3429299592971802, 0.6073935627937317, -0.10915102809667587, 0.3420248329639435, 0.07347360253334045, 0.18400518596172333, 0.2084905058145523, 0.3218590021133423, 0.16883575916290283, -0.6880696415901184, -0.37455135583877563, 0.04792584478855133, -0.04572531208395958, -0.17001567780971527, -0.12369263172149658, -0.3716808259487152, -0.04167286679148674, 0.04307235777378082, -0.1655367612838745, -0.47902533411979675, -0.21886907517910004, 0.4065888226032257, 0.30626556277275085, 0.25965678691864014, 0.07168732583522797, -0.17138782143592834, -0.6293558478355408, -0.6350710988044739, 0.25923609733581543, 0.5668261647224426, -0.030662082135677338, -0.7059182524681091, -0.25901535153388977, 0.25449642539024353, -0.3232290744781494, 0.42758384346961975, 0.7120643258094788, 0.023215001448988914, -0.40807682275772095, 0.1332295536994934, -0.33705568313598633, 0.1038941740989685, 0.39904412627220154, -0.567590057849884, -0.26575762033462524, 0.7635160088539124, -0.38967835903167725, -0.08988548815250397, 0.4150312840938568, -0.540441632270813, 0.33467426896095276, -0.03507159277796745, 0.00720902718603611, 0.6702240109443665, 0.2707512676715851], + b2: &[0.3580038547515869, 0.06861710548400879, -0.04651366174221039, 0.24638813734054565, 0.1557426154613495, -0.40271297097206116, -0.405432790517807], + } + } diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt 
b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt new file mode 100644 index 000000000..8cfc6928a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt @@ -0,0 +1,1701 @@ +armv7neon_mmm_f32_8x4_cortexa7 16 128 8 0.000019373978862224142 +armv7neon_mmm_f32_8x6_generic 24 4 18 0.000005589467629481233 +armv7neon_mmm_f32_8x6_cortexa7 24 32 5 0.0000068541067994687505 +armv7neon_mmm_f32_8x6_cortexa7 24 128 12 0.000038981217414944424 +generic_f32_4x4 4 32 3 0.000001790047741390679 +armv7neon_mmm_f32_8x4_generic 17 32 12 0.000013250293625322834 +armv7neon_mmm_f32_8x4_cortexa7 9 4 9 0.00000392359813080775 +armv7neon_mmm_f32_8x6_generic 16 128 18 0.00003780372998360008 +armv7neon_mmm_f32_8x6_generic 7 32 7 0.000004508038841829227 +armv7neon_mmm_f32_8x4_cortexa9 25 128 4 0.00001979557034383245 +armv7neon_mmm_f32_8x4_cortexa7 24 128 5 0.00002916219359816615 +generic_f32_4x4 8 4 7 0.000002792935337190097 +armv7neon_mmm_f32_8x6_generic 25 32 5 0.000008513973930688824 +armv7neon_mmm_f32_8x4_generic 8 128 4 0.000004984706018353683 +armv7neon_mmm_f32_8x6_cortexa7 23 4 6 0.0000024972411854271217 +armv7neon_mmm_f32_8x6_cortexa9 17 32 18 0.000018034380706056615 +generic_f32_4x4 9 32 5 0.000007787097193216308 +armv7neon_mmm_f32_8x4_cortexa7 7 32 11 0.000005445588694072235 +armv7neon_mmm_f32_8x4_cortexa9 7 4 3 0.0000011467255960994079 +armv7neon_mmm_f32_8x6_cortexa7 7 128 6 0.0000071177868474168225 +armv7neon_mmm_f32_8x6_generic 15 128 5 0.000013291532044598022 +armv7neon_mmm_f32_8x6_cortexa7 25 4 18 0.000007704060547781454 +armv7neon_mmm_f32_8x4_generic 15 128 11 0.000028366457109510148 +armv7neon_mmm_f32_8x6_cortexa7 7 4 12 0.000002031241642569242 +generic_f32_4x4 12 32 9 0.000011111678190760999 +armv7neon_mmm_f32_8x6_cortexa9 24 32 17 0.000018402463967012353 +armv7neon_mmm_f32_8x4_cortexa7 8 4 12 0.0000019146048075747953 +armv7neon_mmm_f32_8x4_cortexa7 23 4 8 0.00000383750508652125 +armv7neon_mmm_f32_8x4_cortexa7 25 4 7 0.0000052043882186278224 
+armv7neon_mmm_f32_8x4_generic 16 32 12 0.000008739868831484192 +generic_f32_4x4 12 4 8 0.00000347003352464419 +armv7neon_mmm_f32_8x6_cortexa7 25 32 19 0.00003152139986100901 +armv7neon_mmm_f32_8x6_cortexa9 25 32 18 0.000023769189598361993 +armv7neon_mmm_f32_8x4_cortexa7 15 128 5 0.00001982738009364039 +armv7neon_mmm_f32_8x6_generic 9 128 11 0.000025754948633914126 +armv7neon_mmm_f32_8x4_cortexa7 15 4 4 0.0000016751657126978973 +armv7neon_mmm_f32_8x4_cortexa9 24 32 8 0.000009220363478862854 +generic_f32_4x4 3 32 13 0.000005627827228507873 +armv7neon_mmm_f32_8x6_cortexa9 23 128 13 0.00006012344410992782 +armv7neon_mmm_f32_8x6_cortexa7 24 128 6 0.00001973568156590405 +armv7neon_mmm_f32_8x4_generic 17 128 8 0.000027722972081041346 +armv7neon_mmm_f32_8x4_generic 9 128 4 0.000009605985891246275 +armv7neon_mmm_f32_8x4_cortexa7 15 32 3 0.000003754036592781278 +armv7neon_mmm_f32_8x4_cortexa7 25 128 8 0.00003835708390551408 +armv7neon_mmm_f32_8x4_cortexa9 9 32 4 0.0000035373293731924376 +generic_f32_4x4 7 4 11 0.000004044788183106201 +armv7neon_mmm_f32_8x6_cortexa9 7 32 12 0.000004744792047894666 +armv7neon_mmm_f32_8x4_cortexa9 23 4 8 0.0000038749254387558 +generic_f32_4x4 9 32 8 0.0000075635927841880176 +armv7neon_mmm_f32_8x4_cortexa7 9 4 12 0.000003727778947810718 +armv7neon_mmm_f32_8x6_cortexa7 23 32 5 0.0000067956400405037536 +generic_f32_4x4 8 4 5 0.000002723326536848814 +armv7neon_mmm_f32_8x6_cortexa9 15 32 5 0.000004690163698982028 +armv7neon_mmm_f32_8x4_cortexa9 9 32 3 0.0000037317693401172363 +armv7neon_mmm_f32_8x6_cortexa7 25 128 13 0.00007831443093305969 +armv7neon_mmm_f32_8x4_cortexa7 24 32 12 0.00001331331475399187 +armv7neon_mmm_f32_8x6_cortexa7 24 128 19 0.00007802209851667727 +armv7neon_mmm_f32_8x4_generic 16 128 11 0.00002788579513287149 +armv7neon_mmm_f32_8x4_cortexa7 17 128 4 0.000014706021218529391 +armv7neon_mmm_f32_8x6_generic 8 4 13 0.0000023378593097707485 +armv7neon_mmm_f32_8x6_generic 7 4 7 0.0000019186954240278283 +armv7neon_mmm_f32_8x4_cortexa9 
16 4 8 0.000002428666090877155 +armv7neon_mmm_f32_8x6_generic 24 128 13 0.00005734829975884668 +generic_f32_4x4 8 128 11 0.000021025833076703056 +armv7neon_mmm_f32_8x6_cortexa9 17 4 19 0.00000803059286526004 +armv7neon_mmm_f32_8x6_cortexa7 25 4 7 0.00000571019568400475 +armv7neon_mmm_f32_8x4_cortexa9 7 128 3 0.000005476007735837036 +generic_f32_4x4 9 32 9 0.000011314156862861827 +armv7neon_mmm_f32_8x4_cortexa9 8 32 3 0.000002175330632998311 +armv7neon_mmm_f32_8x4_cortexa7 9 128 13 0.00003871137553728846 +armv7neon_mmm_f32_8x6_cortexa7 16 32 19 0.00001594300556874623 +armv7neon_mmm_f32_8x4_cortexa7 17 4 13 0.000007095518187604073 +armv7neon_mmm_f32_8x4_generic 17 4 12 0.00000511222434537087 +armv7neon_mmm_f32_8x4_generic 23 32 3 0.000005240662819589997 +armv7neon_mmm_f32_8x6_generic 7 4 19 0.000003459698540347226 +armv7neon_mmm_f32_8x6_cortexa9 7 128 19 0.000027297846707037145 +armv7neon_mmm_f32_8x4_cortexa7 23 4 9 0.000005799914306902319 +armv7neon_mmm_f32_8x4_generic 23 4 4 0.0000021622122978664557 +armv7neon_mmm_f32_8x4_cortexa9 17 128 12 0.00004402054920694197 +armv7neon_mmm_f32_8x6_cortexa9 15 4 18 0.000004616359494672357 +armv7neon_mmm_f32_8x4_cortexa9 8 4 4 0.0000010041412858292361 +armv7neon_mmm_f32_8x4_cortexa9 8 32 13 0.000006496433067761439 +generic_f32_4x4 11 128 3 0.000011040772633263298 +armv7neon_mmm_f32_8x6_generic 24 4 11 0.000004523362892147976 +generic_f32_4x4 9 4 4 0.0000021034125372056796 +generic_f32_4x4 8 4 9 0.000003722094027081023 +generic_f32_4x4 4 4 8 0.0000014952745701485973 +armv7neon_mmm_f32_8x4_generic 16 128 3 0.000009908089584838082 +armv7neon_mmm_f32_8x4_generic 25 32 7 0.000012421086220512836 +armv7neon_mmm_f32_8x6_cortexa7 8 4 5 0.0000013403828133224992 +armv7neon_mmm_f32_8x4_cortexa7 23 32 4 0.00000503241852364444 +armv7neon_mmm_f32_8x6_cortexa9 7 32 6 0.000002618507739501653 +armv7neon_mmm_f32_8x4_cortexa7 7 4 13 0.0000031793808774197963 +armv7neon_mmm_f32_8x4_generic 16 4 7 0.00000282397574390546 +armv7neon_mmm_f32_8x4_cortexa9 
17 128 4 0.00001500697741030371 +armv7neon_mmm_f32_8x6_cortexa7 24 4 6 0.000002233395192000468 +armv7neon_mmm_f32_8x6_cortexa9 15 128 6 0.0000138021942440989 +armv7neon_mmm_f32_8x4_cortexa9 15 128 11 0.000030164013207782058 +generic_f32_4x4 7 4 5 0.0000028577946255424533 +armv7neon_mmm_f32_8x4_cortexa9 16 128 5 0.000020020455876347578 +armv7neon_mmm_f32_8x6_cortexa9 16 128 6 0.000013512716817189273 +armv7neon_mmm_f32_8x4_cortexa7 16 4 11 0.0000037996360400635168 +armv7neon_mmm_f32_8x4_generic 7 32 4 0.0000021000959358733572 +armv7neon_mmm_f32_8x4_cortexa9 9 4 11 0.000004001256873073548 +armv7neon_mmm_f32_8x6_generic 24 128 7 0.00003837839936999645 +armv7neon_mmm_f32_8x4_cortexa9 15 128 4 0.00001034155567067011 +armv7neon_mmm_f32_8x4_generic 7 128 9 0.000014585897754194483 +armv7neon_mmm_f32_8x4_cortexa9 7 128 11 0.000015558039194528556 +armv7neon_mmm_f32_8x4_generic 25 32 11 0.000018056248294844393 +armv7neon_mmm_f32_8x6_cortexa7 9 32 6 0.0000044289694676171784 +armv7neon_mmm_f32_8x4_cortexa9 9 32 7 0.000006775203907440964 +armv7neon_mmm_f32_8x4_cortexa7 17 32 4 0.000004906536483038752 +armv7neon_mmm_f32_8x6_generic 25 128 13 0.00007697771420855431 +armv7neon_mmm_f32_8x6_generic 15 32 13 0.000012236156128465641 +armv7neon_mmm_f32_8x4_generic 9 32 11 0.000009363916532723868 +armv7neon_mmm_f32_8x6_cortexa9 16 32 7 0.000008475723903784643 +armv7neon_mmm_f32_8x4_cortexa9 24 4 8 0.000003400276973989579 +armv7neon_mmm_f32_8x4_generic 8 32 9 0.000004816590922050574 +armv7neon_mmm_f32_8x6_cortexa9 7 32 18 0.0000068452889719178265 +armv7neon_mmm_f32_8x4_cortexa9 9 32 5 0.000006726253331965845 +generic_f32_4x4 11 32 13 0.000015033889948226767 +armv7neon_mmm_f32_8x4_generic 17 4 11 0.000005556840437336695 +generic_f32_4x4 13 32 7 0.000010313627379766583 +generic_f32_4x4 9 4 3 0.0000023801388628702307 +armv7neon_mmm_f32_8x4_generic 8 128 3 0.00000520201390212873 +armv7neon_mmm_f32_8x4_generic 8 32 12 0.000004612565409735615 +armv7neon_mmm_f32_8x4_cortexa9 16 128 4 
0.000010106142800191435 +armv7neon_mmm_f32_8x4_cortexa7 8 4 5 0.0000016424950539860917 +armv7neon_mmm_f32_8x4_generic 8 32 8 0.000003246701797046072 +armv7neon_mmm_f32_8x6_cortexa7 17 128 7 0.00003957490867985703 +armv7neon_mmm_f32_8x4_cortexa7 7 32 12 0.000005449789255433999 +armv7neon_mmm_f32_8x4_cortexa7 15 32 5 0.000006774977020301673 +armv7neon_mmm_f32_8x6_generic 7 4 13 0.0000027111208216897315 +armv7neon_mmm_f32_8x4_cortexa7 16 32 5 0.000006561637517767873 +generic_f32_4x4 3 128 9 0.000011017761032513612 +armv7neon_mmm_f32_8x4_cortexa7 23 128 5 0.00002948816149020999 +armv7neon_mmm_f32_8x4_cortexa7 17 32 11 0.00001417097516045665 +armv7neon_mmm_f32_8x6_cortexa7 16 4 12 0.000002812978829286788 +armv7neon_mmm_f32_8x6_cortexa9 23 32 5 0.0000068076361854689625 +armv7neon_mmm_f32_8x4_cortexa7 24 4 3 0.0000025622772317463136 +armv7neon_mmm_f32_8x4_cortexa9 16 32 13 0.000012465423813216206 +generic_f32_4x4 4 32 11 0.000004083209759238612 +armv7neon_mmm_f32_8x4_generic 7 32 5 0.000003626955080135103 +armv7neon_mmm_f32_8x6_cortexa7 23 128 6 0.000020078583004924116 +armv7neon_mmm_f32_8x6_cortexa9 25 128 19 0.00010584384897256878 +armv7neon_mmm_f32_8x6_generic 17 4 12 0.000004107213127611975 +armv7neon_mmm_f32_8x6_cortexa7 16 4 17 0.000004432708536915881 +armv7neon_mmm_f32_8x6_cortexa7 16 32 18 0.00001183035782221589 +armv7neon_mmm_f32_8x4_cortexa9 25 4 3 0.000003166221225598654 +armv7neon_mmm_f32_8x4_cortexa9 24 128 9 0.00004426766902863704 +armv7neon_mmm_f32_8x6_generic 25 4 18 0.000007554578118985422 +armv7neon_mmm_f32_8x6_cortexa7 15 32 5 0.000004708546886703234 +armv7neon_mmm_f32_8x6_cortexa9 9 128 6 0.000013657373073808522 +armv7neon_mmm_f32_8x6_cortexa7 25 32 7 0.00001623345709716141 +armv7neon_mmm_f32_8x4_cortexa7 9 32 5 0.000006618675517469529 +armv7neon_mmm_f32_8x4_cortexa7 7 32 13 0.000007030082349548267 +armv7neon_mmm_f32_8x4_cortexa9 17 4 7 0.0000040753817592878874 +armv7neon_mmm_f32_8x6_generic 15 4 5 0.0000019991656588201214 +generic_f32_4x4 12 128 5 
0.00002108090993489815 +armv7neon_mmm_f32_8x6_cortexa9 25 128 17 0.00007984886358906369 +generic_f32_4x4 8 128 5 0.000014230862544269676 +armv7neon_mmm_f32_8x6_generic 24 128 18 0.000056743692014886694 +armv7neon_mmm_f32_8x4_cortexa9 8 32 8 0.0000034133387591211906 +armv7neon_mmm_f32_8x4_generic 9 128 12 0.000027800478110813017 +generic_f32_4x4 7 32 7 0.000005509675967097076 +armv7neon_mmm_f32_8x6_cortexa9 16 32 13 0.000012310372306220252 +armv7neon_mmm_f32_8x4_generic 15 128 13 0.00003762533200230424 +generic_f32_4x4 4 128 5 0.000007369710897637148 +armv7neon_mmm_f32_8x6_generic 16 128 6 0.000012947918354537283 +armv7neon_mmm_f32_8x4_cortexa7 8 32 3 0.000002138296476186818 +armv7neon_mmm_f32_8x4_generic 9 128 11 0.000028058363419186727 +armv7neon_mmm_f32_8x4_cortexa7 7 128 13 0.00002008308819140427 +armv7neon_mmm_f32_8x6_cortexa9 15 128 12 0.000027068411466415602 +armv7neon_mmm_f32_8x6_generic 8 128 12 0.000012940488826817833 +armv7neon_mmm_f32_8x4_generic 24 32 7 0.00000940385598649166 +armv7neon_mmm_f32_8x6_cortexa7 15 128 19 0.00005293187157460067 +armv7neon_mmm_f32_8x4_cortexa9 25 128 3 0.000020446918539621497 +armv7neon_mmm_f32_8x4_cortexa7 17 4 9 0.000005534537134563046 +armv7neon_mmm_f32_8x6_cortexa9 15 128 19 0.00005366369135659099 +armv7neon_mmm_f32_8x4_generic 25 4 4 0.000002522928321866339 +armv7neon_mmm_f32_8x6_generic 16 4 19 0.000005280876227150452 +armv7neon_mmm_f32_8x6_generic 23 128 12 0.000038470533759474454 +armv7neon_mmm_f32_8x4_generic 16 32 3 0.00000367665371967417 +armv7neon_mmm_f32_8x6_cortexa7 7 32 12 0.000004702345831209106 +armv7neon_mmm_f32_8x6_generic 16 32 11 0.000008300619599261807 +armv7neon_mmm_f32_8x4_cortexa9 23 4 5 0.000004136206632272795 +armv7neon_mmm_f32_8x4_cortexa9 23 32 7 0.000010130368574328868 +armv7neon_mmm_f32_8x4_generic 7 128 12 0.000014642209408823472 +armv7neon_mmm_f32_8x4_generic 16 4 3 0.0000018668229452058654 +armv7neon_mmm_f32_8x4_cortexa9 24 128 8 0.00002933769331717579 +armv7neon_mmm_f32_8x4_cortexa9 15 32 5 
0.0000068812833008684075 +armv7neon_mmm_f32_8x6_generic 16 4 17 0.000004317183704154257 +armv7neon_mmm_f32_8x6_cortexa9 17 32 7 0.000012443561351360363 +generic_f32_4x4 13 128 7 0.000028155411441347644 +armv7neon_mmm_f32_8x6_cortexa9 25 4 12 0.000005357355009390057 +armv7neon_mmm_f32_8x6_cortexa7 9 128 18 0.00003939231793264078 +generic_f32_4x4 9 128 13 0.00004157559429631316 +armv7neon_mmm_f32_8x6_cortexa9 23 32 11 0.000012894467716591857 +armv7neon_mmm_f32_8x6_generic 17 32 19 0.000023157961942761285 +armv7neon_mmm_f32_8x6_generic 23 128 13 0.00005762004261565962 +armv7neon_mmm_f32_8x6_generic 25 32 12 0.000015419463508057784 +armv7neon_mmm_f32_8x6_generic 23 32 19 0.000023607314480173466 +generic_f32_4x4 11 4 9 0.0000056154966368798884 +armv7neon_mmm_f32_8x4_generic 23 4 3 0.000002512455070432762 +armv7neon_mmm_f32_8x6_generic 17 128 12 0.000038205761728052566 +armv7neon_mmm_f32_8x6_cortexa9 25 4 6 0.0000029441583370270357 +armv7neon_mmm_f32_8x6_cortexa7 8 128 5 0.00000718614713533821 +armv7neon_mmm_f32_8x6_cortexa9 9 32 11 0.000008580295552398777 +armv7neon_mmm_f32_8x4_generic 17 4 8 0.0000036027501506567706 +armv7neon_mmm_f32_8x4_cortexa9 24 128 12 0.000043681280972474085 +armv7neon_mmm_f32_8x6_cortexa7 23 128 19 0.00007892740320659124 +armv7neon_mmm_f32_8x4_cortexa7 7 4 7 0.000001867957586629068 +armv7neon_mmm_f32_8x4_cortexa7 17 128 3 0.000015105393976853567 +armv7neon_mmm_f32_8x6_cortexa9 16 32 5 0.000004737870196682113 +armv7neon_mmm_f32_8x6_cortexa7 17 32 11 0.000012513663992372357 +armv7neon_mmm_f32_8x4_cortexa9 8 4 13 0.000002615891904894993 +generic_f32_4x4 9 128 8 0.00002092560518618388 +armv7neon_mmm_f32_8x6_cortexa9 24 4 13 0.000006118924275544352 +armv7neon_mmm_f32_8x4_cortexa9 23 128 12 0.00004443207751479167 +armv7neon_mmm_f32_8x4_cortexa9 9 32 13 0.000012776770370453443 +armv7neon_mmm_f32_8x4_cortexa7 7 128 4 0.000005416694010439176 +generic_f32_4x4 3 32 3 0.0000017607189056083412 +generic_f32_4x4 8 32 13 0.00000985693332598433 
+armv7neon_mmm_f32_8x4_generic 15 128 9 0.00002827090625785394 +armv7neon_mmm_f32_8x6_generic 17 4 5 0.0000026659420735671337 +armv7neon_mmm_f32_8x6_cortexa9 7 4 19 0.000003496314614706576 +generic_f32_4x4 9 32 12 0.00001103802239472411 +armv7neon_mmm_f32_8x6_cortexa7 24 32 19 0.00002361513173387037 +armv7neon_mmm_f32_8x6_cortexa9 24 32 13 0.00001816124851845242 +armv7neon_mmm_f32_8x6_cortexa7 24 4 19 0.000007820954390784165 +armv7neon_mmm_f32_8x4_cortexa9 16 4 9 0.0000037661558813876353 +armv7neon_mmm_f32_8x6_generic 25 4 13 0.000007941364297025098 +armv7neon_mmm_f32_8x6_cortexa7 23 128 7 0.0000397790896264826 +armv7neon_mmm_f32_8x4_cortexa7 23 32 12 0.000014064248486107495 +armv7neon_mmm_f32_8x6_cortexa9 25 32 17 0.000024451766821636896 +armv7neon_mmm_f32_8x4_generic 16 32 13 0.000011863952520137507 +armv7neon_mmm_f32_8x6_cortexa9 15 4 13 0.000004662604065748643 +armv7neon_mmm_f32_8x4_cortexa7 16 128 13 0.00003851383301865242 +armv7neon_mmm_f32_8x4_cortexa9 17 4 3 0.0000024871767077518544 +armv7neon_mmm_f32_8x6_generic 9 128 18 0.000038275747204244 +armv7neon_mmm_f32_8x4_cortexa7 23 4 7 0.000004236340851810833 +armv7neon_mmm_f32_8x4_generic 24 32 12 0.000012855667391841598 +armv7neon_mmm_f32_8x6_cortexa9 15 32 17 0.000012917112497463966 +armv7neon_mmm_f32_8x4_generic 23 128 11 0.000042177094013740186 +armv7neon_mmm_f32_8x4_cortexa7 16 128 12 0.000028624335558632763 +armv7neon_mmm_f32_8x6_cortexa7 23 4 17 0.000006797176857939227 +armv7neon_mmm_f32_8x4_generic 25 4 3 0.0000031059316600251953 +armv7neon_mmm_f32_8x6_cortexa7 23 4 19 0.000008441504854510506 +armv7neon_mmm_f32_8x6_cortexa7 23 128 17 0.00005957716872417335 +armv7neon_mmm_f32_8x4_generic 24 32 3 0.000005255472766158138 +armv7neon_mmm_f32_8x6_cortexa7 7 32 6 0.0000025969460967210575 +generic_f32_4x4 12 128 9 0.00003118081783563989 +armv7neon_mmm_f32_8x6_cortexa7 24 32 7 0.000012316039376583595 +armv7neon_mmm_f32_8x4_cortexa9 9 128 11 0.00002984202658897139 +armv7neon_mmm_f32_8x6_generic 24 4 12 
0.00000390465698063092 +armv7neon_mmm_f32_8x6_generic 24 128 5 0.000019825223712031248 +generic_f32_4x4 11 128 12 0.00003129955987835105 +armv7neon_mmm_f32_8x4_generic 15 32 8 0.000006474371340367591 +armv7neon_mmm_f32_8x6_cortexa7 25 128 17 0.00007864370121172857 +armv7neon_mmm_f32_8x4_cortexa7 7 4 9 0.000002506847196975368 +armv7neon_mmm_f32_8x6_cortexa9 25 4 11 0.000005933707367961421 +armv7neon_mmm_f32_8x4_cortexa9 16 4 5 0.000002787654264318038 +armv7neon_mmm_f32_8x6_generic 17 32 13 0.00001761805014832438 +armv7neon_mmm_f32_8x4_cortexa9 7 32 4 0.0000021905945506812955 +armv7neon_mmm_f32_8x6_cortexa9 17 128 11 0.00004027130474721357 +generic_f32_4x4 7 32 9 0.000007912119634032119 +armv7neon_mmm_f32_8x4_generic 23 4 13 0.0000073828945636745854 +generic_f32_4x4 12 128 11 0.00003129061011102201 +armv7neon_mmm_f32_8x6_cortexa7 9 4 5 0.0000019755012039429016 +armv7neon_mmm_f32_8x6_cortexa9 8 4 13 0.000002405709930798169 +armv7neon_mmm_f32_8x6_cortexa9 24 4 7 0.0000044131675061571695 +armv7neon_mmm_f32_8x4_cortexa7 25 128 3 0.00002004605546076412 +armv7neon_mmm_f32_8x6_cortexa7 25 32 11 0.000016504451247846665 +generic_f32_4x4 11 4 3 0.0000024035209703323816 +generic_f32_4x4 5 128 9 0.00002116685987655445 +armv7neon_mmm_f32_8x4_generic 9 32 9 0.000009310383786805071 +armv7neon_mmm_f32_8x6_cortexa7 15 32 12 0.00000858295982977344 +generic_f32_4x4 9 4 9 0.0000055122591860013075 +armv7neon_mmm_f32_8x4_cortexa7 24 128 4 0.000014582241975346975 +armv7neon_mmm_f32_8x6_cortexa7 7 32 11 0.0000047102983594351624 +armv7neon_mmm_f32_8x6_cortexa7 17 32 5 0.000006703902999940797 +armv7neon_mmm_f32_8x4_cortexa9 9 32 12 0.000009569569775041666 +generic_f32_4x4 11 32 4 0.000004102122527868448 +armv7neon_mmm_f32_8x4_cortexa7 23 128 11 0.00004393294052475175 +armv7neon_mmm_f32_8x6_cortexa7 17 128 18 0.00005867315609945908 +armv7neon_mmm_f32_8x6_generic 15 4 19 0.000005888722731281036 +armv7neon_mmm_f32_8x4_cortexa9 25 128 12 0.000058416478283637004 +armv7neon_mmm_f32_8x6_cortexa7 15 
128 18 0.00003981402002143062 +armv7neon_mmm_f32_8x6_generic 17 32 11 0.000012156545175144957 +armv7neon_mmm_f32_8x6_cortexa9 17 128 5 0.000020505815042923473 +armv7neon_mmm_f32_8x4_generic 23 4 8 0.0000037989035755978144 +armv7neon_mmm_f32_8x4_cortexa9 23 128 4 0.000015133679307570648 +generic_f32_4x4 12 32 3 0.000004382513702192395 +generic_f32_4x4 12 4 5 0.000003824564176783666 +generic_f32_4x4 5 32 7 0.000005434572832921092 +generic_f32_4x4 3 4 8 0.0000017566767353451639 +armv7neon_mmm_f32_8x6_cortexa7 7 32 13 0.0000067229229930212575 +armv7neon_mmm_f32_8x4_generic 8 4 13 0.0000025649388423337978 +generic_f32_4x4 11 32 12 0.000011236073634740423 +armv7neon_mmm_f32_8x6_generic 25 4 17 0.000008196880964751204 +armv7neon_mmm_f32_8x6_cortexa9 7 32 11 0.000004753650782271882 +armv7neon_mmm_f32_8x4_cortexa9 7 128 4 0.000005529569938400554 +generic_f32_4x4 13 4 13 0.000009131495505295274 +armv7neon_mmm_f32_8x6_cortexa9 9 4 7 0.00000312421222650566 +armv7neon_mmm_f32_8x6_generic 17 32 7 0.000011970222410677647 +armv7neon_mmm_f32_8x4_cortexa7 7 4 3 0.0000011523161892903704 +armv7neon_mmm_f32_8x6_cortexa7 23 128 18 0.00005908293678660039 +armv7neon_mmm_f32_8x4_cortexa9 23 4 13 0.000007508665249371487 +armv7neon_mmm_f32_8x4_cortexa9 7 32 8 0.000003875838102020609 +armv7neon_mmm_f32_8x6_cortexa7 24 32 11 0.00001256332295885272 +armv7neon_mmm_f32_8x6_cortexa9 23 128 6 0.000020319056425196668 +armv7neon_mmm_f32_8x4_generic 23 32 4 0.00000488125169723422 +armv7neon_mmm_f32_8x6_generic 24 32 18 0.00001705594140661088 +armv7neon_mmm_f32_8x4_generic 7 4 3 0.000001143118878212571 +armv7neon_mmm_f32_8x4_generic 24 4 3 0.000002536821660220204 +armv7neon_mmm_f32_8x6_cortexa9 7 4 17 0.000002840883149941229 +armv7neon_mmm_f32_8x6_cortexa7 17 32 13 0.000018113836070828896 +armv7neon_mmm_f32_8x6_cortexa7 24 4 5 0.000002893802636827323 +armv7neon_mmm_f32_8x4_cortexa9 7 128 8 0.00001055308814900546 +armv7neon_mmm_f32_8x4_cortexa9 15 4 4 0.0000017017415253459248 
+armv7neon_mmm_f32_8x6_generic 9 128 17 0.00003847119775045202 +armv7neon_mmm_f32_8x4_cortexa9 17 128 7 0.00003001254835210308 +generic_f32_4x4 7 128 3 0.0000075145364744355526 +armv7neon_mmm_f32_8x6_cortexa9 24 128 13 0.00005963153865876477 +armv7neon_mmm_f32_8x6_generic 16 4 11 0.0000031955847464919697 +armv7neon_mmm_f32_8x6_generic 24 128 17 0.000057273707239946334 +armv7neon_mmm_f32_8x6_cortexa9 16 128 17 0.00004016422558583101 +armv7neon_mmm_f32_8x4_generic 24 4 7 0.000003971324732226714 +armv7neon_mmm_f32_8x6_cortexa9 25 32 11 0.0000166679830493271 +armv7neon_mmm_f32_8x4_generic 7 128 3 0.000005169798410902207 +armv7neon_mmm_f32_8x6_cortexa7 15 4 11 0.000003482445418673263 +armv7neon_mmm_f32_8x6_cortexa7 8 128 6 0.0000069109869984566186 +armv7neon_mmm_f32_8x4_cortexa9 24 32 13 0.000018445950950001155 +armv7neon_mmm_f32_8x6_generic 8 4 11 0.0000018570966900607452 +armv7neon_mmm_f32_8x6_cortexa9 24 4 6 0.000002228168146536169 +armv7neon_mmm_f32_8x6_generic 8 128 7 0.000013146479087947972 +armv7neon_mmm_f32_8x6_cortexa9 25 32 12 0.000016067383961246167 +armv7neon_mmm_f32_8x4_cortexa9 16 32 5 0.000006662778599544951 +armv7neon_mmm_f32_8x6_generic 9 4 11 0.0000031451298378122295 +armv7neon_mmm_f32_8x4_generic 16 32 7 0.000006438918321306648 +armv7neon_mmm_f32_8x6_cortexa9 7 32 7 0.000004657116192635584 +armv7neon_mmm_f32_8x6_generic 17 32 12 0.00001176030032709091 +armv7neon_mmm_f32_8x4_generic 15 4 8 0.0000028163110410039656 +armv7neon_mmm_f32_8x4_cortexa9 7 4 9 0.0000025260788973778824 +generic_f32_4x4 8 4 11 0.000003795516523798154 +armv7neon_mmm_f32_8x6_generic 15 32 6 0.0000044168833936544535 +armv7neon_mmm_f32_8x4_cortexa7 8 128 9 0.000014761720958613642 +armv7neon_mmm_f32_8x6_cortexa7 9 32 7 0.000008410754850495816 +armv7neon_mmm_f32_8x4_cortexa7 16 4 13 0.000004684953060253782 +armv7neon_mmm_f32_8x4_generic 16 32 5 0.000006354175934259964 +armv7neon_mmm_f32_8x4_generic 24 4 4 0.0000019148396982311507 +armv7neon_mmm_f32_8x4_generic 17 128 4 
0.000014107689106847269 +armv7neon_mmm_f32_8x6_cortexa9 24 32 11 0.000012681731796133347 +generic_f32_4x4 3 4 5 0.0000017351550450721446 +armv7neon_mmm_f32_8x4_cortexa7 16 4 9 0.0000037512996244278985 +armv7neon_mmm_f32_8x6_cortexa9 23 32 13 0.000018600736039744996 +generic_f32_4x4 7 32 13 0.000010363567629711427 +generic_f32_4x4 7 128 13 0.000028209372228861907 +armv7neon_mmm_f32_8x6_generic 24 32 13 0.000017489604886914003 +armv7neon_mmm_f32_8x4_cortexa7 24 32 8 0.00000906737294441132 +armv7neon_mmm_f32_8x6_generic 15 32 5 0.000004544284306242495 +armv7neon_mmm_f32_8x6_cortexa9 16 128 19 0.00005301309753082383 +armv7neon_mmm_f32_8x6_cortexa7 15 128 11 0.000026895954754680612 +armv7neon_mmm_f32_8x4_cortexa9 15 32 12 0.000009927381424295026 +armv7neon_mmm_f32_8x4_cortexa9 16 128 12 0.00002922630339731252 +armv7neon_mmm_f32_8x4_cortexa9 24 4 11 0.000005481062579336857 +armv7neon_mmm_f32_8x6_generic 7 128 18 0.000019821467429857938 +armv7neon_mmm_f32_8x6_cortexa9 7 32 5 0.000002575926161215081 +armv7neon_mmm_f32_8x6_cortexa7 15 32 7 0.000008595071585263579 +armv7neon_mmm_f32_8x4_cortexa9 17 32 4 0.000005001583148868544 +armv7neon_mmm_f32_8x4_cortexa7 25 32 4 0.000006334740769256929 +armv7neon_mmm_f32_8x4_cortexa7 25 128 7 0.00003905527355458101 +generic_f32_4x4 4 4 5 0.0000016159570864189527 +armv7neon_mmm_f32_8x4_cortexa9 17 128 11 0.000044522329607090446 +armv7neon_mmm_f32_8x6_cortexa9 8 128 18 0.0000200309695734825 +armv7neon_mmm_f32_8x6_generic 24 32 19 0.000022973011330361666 +generic_f32_4x4 3 4 13 0.000002984204583865987 +armv7neon_mmm_f32_8x6_cortexa9 15 4 5 0.000002015871636210361 +generic_f32_4x4 7 4 9 0.000003999946420645267 +armv7neon_mmm_f32_8x4_generic 7 4 9 0.0000024834088306212943 +armv7neon_mmm_f32_8x4_cortexa7 25 128 9 0.00005782975703195741 +armv7neon_mmm_f32_8x4_generic 25 32 8 0.000011779346411467398 +armv7neon_mmm_f32_8x6_cortexa9 25 32 7 0.000016399753838712434 +armv7neon_mmm_f32_8x6_generic 24 4 6 0.000002204607070591687 
+armv7neon_mmm_f32_8x6_cortexa7 9 32 17 0.000012402833540807695 +armv7neon_mmm_f32_8x4_cortexa7 25 128 13 0.00007672799293438395 +armv7neon_mmm_f32_8x6_generic 24 128 12 0.0000379228420372398 +armv7neon_mmm_f32_8x4_cortexa7 16 128 3 0.00001030544963732427 +armv7neon_mmm_f32_8x4_generic 15 128 3 0.000009885955298583161 +armv7neon_mmm_f32_8x4_generic 15 4 12 0.0000039769457516235065 +armv7neon_mmm_f32_8x6_generic 23 4 11 0.000004727342544050073 +armv7neon_mmm_f32_8x6_cortexa7 15 4 12 0.0000032858094110604857 +armv7neon_mmm_f32_8x4_cortexa7 16 4 3 0.0000018824484779381146 +armv7neon_mmm_f32_8x6_cortexa9 23 4 7 0.000004580846920018228 +generic_f32_4x4 11 4 8 0.00000379091798312253 +armv7neon_mmm_f32_8x6_cortexa9 23 32 18 0.00001843667314281391 +armv7neon_mmm_f32_8x6_generic 8 32 18 0.0000060026095209057905 +armv7neon_mmm_f32_8x4_cortexa7 8 32 7 0.00000356603711659208 +armv7neon_mmm_f32_8x6_generic 15 128 18 0.00003865113024109526 +armv7neon_mmm_f32_8x6_generic 7 32 5 0.0000025012501670870884 +generic_f32_4x4 12 128 3 0.00001108291043110956 +armv7neon_mmm_f32_8x6_cortexa9 9 128 17 0.00004019255842003589 +armv7neon_mmm_f32_8x4_cortexa9 15 128 3 0.000010489952376847188 +armv7neon_mmm_f32_8x4_cortexa9 7 4 5 0.0000018283099982388463 +armv7neon_mmm_f32_8x6_cortexa7 23 32 13 0.000018410568030938504 +armv7neon_mmm_f32_8x6_generic 15 4 17 0.000004759242853957918 +armv7neon_mmm_f32_8x6_cortexa7 17 128 5 0.00002027503703993735 +armv7neon_mmm_f32_8x4_generic 7 4 12 0.000002526741586689988 +armv7neon_mmm_f32_8x6_cortexa9 17 4 13 0.000006256702206265106 +armv7neon_mmm_f32_8x6_cortexa7 16 128 13 0.000039449772577428465 +generic_f32_4x4 5 32 4 0.0000028896370502300425 +armv7neon_mmm_f32_8x6_cortexa9 23 4 6 0.000002496256395769302 +armv7neon_mmm_f32_8x4_generic 17 4 9 0.000005469161034223508 +armv7neon_mmm_f32_8x4_generic 17 4 13 0.0000070099392970628585 +armv7neon_mmm_f32_8x6_generic 24 4 17 0.000006212264617752923 +armv7neon_mmm_f32_8x6_cortexa7 23 32 18 0.000018244504523002484 
+armv7neon_mmm_f32_8x4_cortexa7 17 32 3 0.000005314636591625876 +armv7neon_mmm_f32_8x6_generic 23 4 13 0.00000640173388890707 +armv7neon_mmm_f32_8x6_cortexa7 25 4 19 0.00001046601238496823 +armv7neon_mmm_f32_8x4_cortexa7 9 128 5 0.000019669882407282677 +generic_f32_4x4 5 4 3 0.0000017330186385522962 +armv7neon_mmm_f32_8x6_generic 9 4 13 0.000004297017223755962 +armv7neon_mmm_f32_8x4_cortexa9 9 128 7 0.0000201239354954616 +armv7neon_mmm_f32_8x4_cortexa7 8 4 7 0.0000016739241733780589 +armv7neon_mmm_f32_8x6_cortexa7 7 4 6 0.000001271018362845475 +generic_f32_4x4 13 32 9 0.000014839920846066823 +armv7neon_mmm_f32_8x4_generic 25 4 9 0.000007058701573468786 +armv7neon_mmm_f32_8x4_cortexa7 7 32 5 0.000003729557835063842 +armv7neon_mmm_f32_8x6_cortexa9 7 128 5 0.000007173464418699434 +armv7neon_mmm_f32_8x6_cortexa7 24 128 13 0.00005881823134405547 +armv7neon_mmm_f32_8x6_cortexa9 17 32 12 0.000012231167345130436 +armv7neon_mmm_f32_8x4_cortexa7 8 4 4 0.0000009861752945826601 +armv7neon_mmm_f32_8x4_cortexa9 23 128 3 0.00001548775471015623 +armv7neon_mmm_f32_8x4_generic 8 128 9 0.000014166892351007583 +armv7neon_mmm_f32_8x4_cortexa9 25 32 3 0.000007055926590582058 +armv7neon_mmm_f32_8x6_cortexa7 8 32 6 0.0000023905642959740767 +armv7neon_mmm_f32_8x4_generic 24 4 12 0.000004705581936932163 +generic_f32_4x4 13 32 13 0.000019466458499229495 +armv7neon_mmm_f32_8x6_cortexa7 8 128 12 0.000013354945269855691 +generic_f32_4x4 4 32 8 0.000002784564616216757 +generic_f32_4x4 11 4 13 0.000007235980721358389 +armv7neon_mmm_f32_8x4_cortexa9 15 4 9 0.000004211132231298468 +armv7neon_mmm_f32_8x4_generic 23 4 7 0.000004197975758631663 +armv7neon_mmm_f32_8x4_generic 24 4 13 0.000006669346538195287 +generic_f32_4x4 9 4 11 0.000005605492408222339 +armv7neon_mmm_f32_8x4_cortexa9 7 4 7 0.0000018866714926916992 +armv7neon_mmm_f32_8x6_generic 7 4 12 0.00000203404671528183 +armv7neon_mmm_f32_8x6_cortexa9 16 32 12 0.000008156581758904169 +armv7neon_mmm_f32_8x6_generic 9 4 19 0.000005498641044027106 
+generic_f32_4x4 7 4 8 0.000002762174951002869 +armv7neon_mmm_f32_8x4_cortexa9 16 4 3 0.0000019059695727093323 +armv7neon_mmm_f32_8x4_generic 25 4 12 0.00000651099979096713 +armv7neon_mmm_f32_8x6_cortexa7 24 128 5 0.000020414031398736494 +armv7neon_mmm_f32_8x6_cortexa7 7 128 18 0.000020361101695155362 +generic_f32_4x4 11 128 5 0.00002125700072345171 +armv7neon_mmm_f32_8x6_cortexa7 17 32 18 0.000017837908064150246 +armv7neon_mmm_f32_8x6_cortexa7 25 32 13 0.00002390699968855355 +armv7neon_mmm_f32_8x4_generic 23 4 5 0.0000040641271507242515 +generic_f32_4x4 8 4 3 0.0000017865876036353628 +armv7neon_mmm_f32_8x6_cortexa9 16 4 17 0.00000442887972122732 +armv7neon_mmm_f32_8x4_cortexa9 25 32 8 0.000012384930380190053 +generic_f32_4x4 5 4 5 0.000002798692518373374 +armv7neon_mmm_f32_8x6_cortexa9 8 128 6 0.0000070012080134920035 +armv7neon_mmm_f32_8x6_cortexa7 17 32 6 0.000006327458516647519 +armv7neon_mmm_f32_8x4_cortexa7 24 32 7 0.0000097071140215999 +armv7neon_mmm_f32_8x6_generic 23 32 12 0.000012023256652212629 +armv7neon_mmm_f32_8x6_generic 9 32 11 0.000008252674922417165 +armv7neon_mmm_f32_8x6_generic 9 32 6 0.000004271117675286997 +armv7neon_mmm_f32_8x4_cortexa7 7 4 4 0.0000011809046933853917 +armv7neon_mmm_f32_8x6_cortexa7 15 32 19 0.00001661331304395181 +armv7neon_mmm_f32_8x4_generic 25 128 5 0.00003728262011412803 +armv7neon_mmm_f32_8x6_cortexa9 17 128 19 0.00007942199229108003 +generic_f32_4x4 11 128 9 0.000031521930244918895 +armv7neon_mmm_f32_8x6_cortexa7 25 4 6 0.000002951526384640025 +armv7neon_mmm_f32_8x6_generic 25 128 18 0.00007559930226918245 +armv7neon_mmm_f32_8x6_generic 24 32 7 0.000011956301096424158 +armv7neon_mmm_f32_8x4_cortexa7 24 4 4 0.000001942280219822285 +armv7neon_mmm_f32_8x4_cortexa7 16 4 5 0.000002773541103005484 +generic_f32_4x4 13 4 7 0.000005120137461912069 +armv7neon_mmm_f32_8x4_generic 9 4 9 0.000003880248068324046 +armv7neon_mmm_f32_8x4_cortexa9 8 4 5 0.0000016579504301247745 +armv7neon_mmm_f32_8x6_cortexa7 7 32 18 
0.000006779881862785529 +armv7neon_mmm_f32_8x4_generic 24 32 9 0.000013438884904462624 +armv7neon_mmm_f32_8x4_generic 17 32 11 0.000013721051068287207 +armv7neon_mmm_f32_8x4_cortexa7 23 32 3 0.000005392987682793705 +armv7neon_mmm_f32_8x6_cortexa9 17 4 18 0.000005996181221285945 +armv7neon_mmm_f32_8x4_generic 23 128 12 0.000041706025206470595 +armv7neon_mmm_f32_8x4_generic 25 128 7 0.00003743774855697319 +generic_f32_4x4 3 4 12 0.000002379029629256848 +armv7neon_mmm_f32_8x6_cortexa9 24 128 18 0.00005914765901143539 +armv7neon_mmm_f32_8x4_cortexa7 23 32 5 0.00000981049054917298 +armv7neon_mmm_f32_8x6_cortexa7 16 4 19 0.000005412079670267658 +armv7neon_mmm_f32_8x4_cortexa7 8 32 4 0.0000019274117652778804 +armv7neon_mmm_f32_8x6_generic 17 128 6 0.000019306808921524692 +armv7neon_mmm_f32_8x6_cortexa7 7 128 12 0.000013761467144605992 +armv7neon_mmm_f32_8x6_cortexa7 16 32 7 0.000008404964853562951 +armv7neon_mmm_f32_8x6_cortexa9 9 32 7 0.000008475033766481017 +armv7neon_mmm_f32_8x6_cortexa9 15 128 5 0.000013894752583522413 +armv7neon_mmm_f32_8x6_generic 16 128 17 0.000038424998460484644 +armv7neon_mmm_f32_8x4_cortexa7 8 32 9 0.00000497069436204451 +armv7neon_mmm_f32_8x6_generic 9 32 5 0.000004444687551155496 +armv7neon_mmm_f32_8x4_cortexa7 24 32 5 0.00000958372113096483 +armv7neon_mmm_f32_8x6_cortexa9 24 4 12 0.000003962114122866607 +armv7neon_mmm_f32_8x4_generic 15 4 11 0.000004231967658632821 +armv7neon_mmm_f32_8x6_cortexa7 24 32 13 0.000017973816622821935 +armv7neon_mmm_f32_8x4_cortexa9 23 32 13 0.000019157368846217136 +armv7neon_mmm_f32_8x4_generic 16 32 9 0.00000912682021206548 +armv7neon_mmm_f32_8x6_generic 16 32 5 0.000004588157612379825 +armv7neon_mmm_f32_8x6_generic 16 4 13 0.0000041677198842138695 +armv7neon_mmm_f32_8x6_generic 15 128 17 0.00003886891043603058 +armv7neon_mmm_f32_8x6_cortexa9 7 128 11 0.000013961383587786538 +armv7neon_mmm_f32_8x4_generic 7 128 5 0.000009860600106626588 +armv7neon_mmm_f32_8x4_cortexa9 16 4 12 0.0000033867538649589902 
+armv7neon_mmm_f32_8x6_cortexa9 17 32 19 0.000024087066037173788 +armv7neon_mmm_f32_8x6_generic 9 128 5 0.000013189504068813325 +armv7neon_mmm_f32_8x6_cortexa9 8 32 18 0.000006222642744442069 +armv7neon_mmm_f32_8x6_cortexa7 24 32 12 0.000011846425063097506 +generic_f32_4x4 5 4 13 0.000004993325421206121 +armv7neon_mmm_f32_8x6_generic 8 128 11 0.000013157428320463883 +generic_f32_4x4 7 32 4 0.0000029484328830194284 +armv7neon_mmm_f32_8x4_cortexa9 7 32 9 0.000005475212517453128 +armv7neon_mmm_f32_8x4_cortexa9 9 128 13 0.00003949164898061412 +armv7neon_mmm_f32_8x4_cortexa9 23 4 11 0.000005947124934955378 +armv7neon_mmm_f32_8x6_generic 23 32 11 0.00001239533186039336 +armv7neon_mmm_f32_8x6_generic 7 32 17 0.000006661670995420206 +armv7neon_mmm_f32_8x4_cortexa7 7 4 8 0.0000018776318580783529 +armv7neon_mmm_f32_8x6_generic 8 32 5 0.0000025552685069830137 +armv7neon_mmm_f32_8x4_cortexa9 9 32 9 0.000009774693013502346 +generic_f32_4x4 13 128 5 0.000027995490696525238 +armv7neon_mmm_f32_8x4_cortexa9 8 4 11 0.0000021722237770380927 +armv7neon_mmm_f32_8x4_generic 23 128 9 0.0000419391925884448 +armv7neon_mmm_f32_8x4_cortexa9 25 32 7 0.000013031732285626503 +armv7neon_mmm_f32_8x4_cortexa7 25 32 12 0.000017966835523846677 +armv7neon_mmm_f32_8x4_cortexa7 16 32 11 0.000009494335431932482 +armv7neon_mmm_f32_8x6_cortexa7 17 128 19 0.0000783675036460944 +armv7neon_mmm_f32_8x6_cortexa7 17 4 7 0.0000044328101817797105 +generic_f32_4x4 3 32 8 0.000003088179719444515 +generic_f32_4x4 12 4 11 0.000005408469300584859 +armv7neon_mmm_f32_8x4_cortexa7 23 32 13 0.000018862031131309997 +armv7neon_mmm_f32_8x4_generic 23 32 11 0.000014024445889402156 +generic_f32_4x4 4 128 11 0.00001075990813733738 +armv7neon_mmm_f32_8x6_generic 24 4 7 0.000004301779173383875 +armv7neon_mmm_f32_8x6_cortexa9 25 128 12 0.000052850505948183045 +armv7neon_mmm_f32_8x4_generic 15 32 4 0.0000034946744383909553 +armv7neon_mmm_f32_8x4_generic 17 4 3 0.0000024363476636608272 +armv7neon_mmm_f32_8x4_cortexa7 16 128 4 
0.000009891301569266948 +armv7neon_mmm_f32_8x6_cortexa9 25 4 13 0.000008105640345047328 +armv7neon_mmm_f32_8x4_cortexa9 16 4 4 0.000001487225488304516 +generic_f32_4x4 5 32 9 0.000007777979342543209 +armv7neon_mmm_f32_8x6_cortexa7 8 4 19 0.0000029885727357209406 +generic_f32_4x4 8 32 9 0.000007585061806416855 +armv7neon_mmm_f32_8x4_cortexa7 17 128 13 0.000057643034032365925 +armv7neon_mmm_f32_8x4_generic 7 128 4 0.000005216038661083481 +armv7neon_mmm_f32_8x4_cortexa7 16 32 12 0.000009049743188132303 +armv7neon_mmm_f32_8x4_cortexa9 17 4 8 0.000003682177568678712 +armv7neon_mmm_f32_8x4_cortexa9 17 32 11 0.000014400917572892256 +armv7neon_mmm_f32_8x4_cortexa9 15 4 12 0.0000040529055101759995 +armv7neon_mmm_f32_8x4_cortexa7 15 4 11 0.000004271052070252677 +armv7neon_mmm_f32_8x6_cortexa7 16 128 5 0.000013791269931666714 +armv7neon_mmm_f32_8x6_generic 8 32 7 0.000004325189484240455 +generic_f32_4x4 3 128 7 0.000007522472627920054 +armv7neon_mmm_f32_8x6_cortexa9 9 32 6 0.000004463650285453612 +generic_f32_4x4 11 32 7 0.000007948963253145964 +armv7neon_mmm_f32_8x4_cortexa9 24 128 4 0.000014890215129529216 +armv7neon_mmm_f32_8x6_cortexa9 16 4 19 0.000005414025318591343 +armv7neon_mmm_f32_8x6_cortexa7 17 4 6 0.00000238128254631235 +armv7neon_mmm_f32_8x4_cortexa9 24 4 5 0.000003916194410369112 +armv7neon_mmm_f32_8x4_cortexa9 8 128 9 0.000015067666888020707 +armv7neon_mmm_f32_8x6_cortexa7 23 32 11 0.000012775953874196187 +generic_f32_4x4 8 32 3 0.0000030910360857362123 +armv7neon_mmm_f32_8x4_cortexa7 7 32 3 0.0000021042104056045423 +armv7neon_mmm_f32_8x6_cortexa9 25 32 5 0.000008813546121201035 +armv7neon_mmm_f32_8x6_cortexa7 25 4 17 0.000008369215065652083 +generic_f32_4x4 3 32 9 0.000004340286298208673 +armv7neon_mmm_f32_8x4_generic 17 4 5 0.000003911088956924729 +generic_f32_4x4 9 4 13 0.000007067880206932553 +armv7neon_mmm_f32_8x4_cortexa7 8 128 13 0.00001944164572907436 +armv7neon_mmm_f32_8x4_cortexa7 25 4 8 0.000004598061889383109 +armv7neon_mmm_f32_8x4_generic 15 32 5 
0.000006570904917754489 +armv7neon_mmm_f32_8x4_cortexa9 7 4 4 0.000001199774667010064 +armv7neon_mmm_f32_8x4_cortexa9 16 32 9 0.000009580069903775671 +armv7neon_mmm_f32_8x6_generic 17 4 19 0.000007879961865942272 +armv7neon_mmm_f32_8x4_cortexa7 23 128 13 0.00005805408197282095 +armv7neon_mmm_f32_8x6_generic 25 32 13 0.000023213828889712974 +armv7neon_mmm_f32_8x6_cortexa9 23 4 17 0.000006797095061554012 +armv7neon_mmm_f32_8x6_cortexa9 24 32 6 0.000006232031189380453 +armv7neon_mmm_f32_8x4_cortexa7 15 128 11 0.00002955574134898906 +armv7neon_mmm_f32_8x6_cortexa9 24 4 11 0.000004626561901203258 +armv7neon_mmm_f32_8x6_cortexa9 8 4 7 0.0000018344586562526734 +armv7neon_mmm_f32_8x6_generic 15 4 18 0.000004518763751153566 +armv7neon_mmm_f32_8x6_cortexa9 16 128 11 0.000027030003785723685 +armv7neon_mmm_f32_8x4_cortexa7 9 4 4 0.0000015888326511808313 +armv7neon_mmm_f32_8x4_cortexa7 7 4 12 0.0000025479152659232096 +armv7neon_mmm_f32_8x4_cortexa7 23 128 7 0.000029563915293732792 +armv7neon_mmm_f32_8x4_cortexa9 25 4 13 0.000009216194666264275 +generic_f32_4x4 4 4 7 0.0000016464048874437295 +armv7neon_mmm_f32_8x4_cortexa7 7 128 11 0.00001525304959159917 +armv7neon_mmm_f32_8x4_cortexa9 9 4 9 0.000003956900236836743 +generic_f32_4x4 4 128 8 0.000007233357504684728 +armv7neon_mmm_f32_8x4_cortexa9 16 128 9 0.000029612198853914824 +armv7neon_mmm_f32_8x4_generic 15 128 12 0.000028154522908512787 +armv7neon_mmm_f32_8x6_cortexa9 8 128 5 0.000007227187608771384 +armv7neon_mmm_f32_8x4_cortexa9 25 128 7 0.000039766026339765396 +generic_f32_4x4 9 128 5 0.00002114949804637606 +armv7neon_mmm_f32_8x4_generic 17 32 4 0.000004758199768605913 +armv7neon_mmm_f32_8x6_cortexa9 7 4 13 0.0000027386176506007003 +generic_f32_4x4 7 32 8 0.000005375358081458721 +armv7neon_mmm_f32_8x6_cortexa9 8 4 18 0.000002220900812833037 +armv7neon_mmm_f32_8x4_cortexa9 16 128 11 0.000029669482734966696 +armv7neon_mmm_f32_8x6_cortexa7 8 32 18 0.000006161276860676508 +armv7neon_mmm_f32_8x4_cortexa7 8 32 8 
0.000003353396080834466 +armv7neon_mmm_f32_8x6_generic 16 128 13 0.000038174416411990654 +armv7neon_mmm_f32_8x6_generic 15 32 19 0.00001612698422472605 +armv7neon_mmm_f32_8x4_cortexa9 9 128 3 0.000010402587362113396 +armv7neon_mmm_f32_8x6_cortexa9 9 4 11 0.0000032196424297717835 +armv7neon_mmm_f32_8x4_cortexa9 8 128 11 0.000015086238127499093 +armv7neon_mmm_f32_8x6_generic 16 128 19 0.00005061640609152132 +armv7neon_mmm_f32_8x4_generic 16 128 9 0.00002782092751958622 +generic_f32_4x4 4 32 4 0.000001643238458184986 +armv7neon_mmm_f32_8x6_cortexa7 17 4 11 0.0000046002558984855774 +generic_f32_4x4 5 4 9 0.0000039003390490700075 +generic_f32_4x4 9 128 7 0.000021241983402647673 +generic_f32_4x4 7 4 13 0.000005155057897108874 +armv7neon_mmm_f32_8x6_generic 17 4 17 0.000006303527455540143 +armv7neon_mmm_f32_8x4_generic 16 4 13 0.000004620655887935485 +armv7neon_mmm_f32_8x4_cortexa7 8 128 5 0.000010060854723538398 +armv7neon_mmm_f32_8x4_cortexa9 16 32 11 0.000009653502461801779 +armv7neon_mmm_f32_8x6_generic 23 128 5 0.00001973212006224306 +armv7neon_mmm_f32_8x4_generic 8 4 7 0.000001657225281211141 +armv7neon_mmm_f32_8x6_generic 16 4 12 0.0000027740993216529004 +armv7neon_mmm_f32_8x6_cortexa9 15 128 18 0.00004029185998372536 +armv7neon_mmm_f32_8x4_cortexa9 25 32 4 0.000006456592320044242 +armv7neon_mmm_f32_8x6_cortexa7 25 128 12 0.000052118173447989496 +armv7neon_mmm_f32_8x4_cortexa7 25 32 9 0.000018537854346104505 +armv7neon_mmm_f32_8x6_cortexa7 16 4 7 0.000003147284440858119 +armv7neon_mmm_f32_8x4_generic 17 128 5 0.000028048493049013062 +armv7neon_mmm_f32_8x6_cortexa7 23 4 18 0.000006335447404352516 +armv7neon_mmm_f32_8x6_cortexa9 24 4 19 0.000007813810057423332 +armv7neon_mmm_f32_8x4_cortexa7 17 128 8 0.000028915520714406696 +armv7neon_mmm_f32_8x4_cortexa7 17 128 7 0.000029363768223094683 +generic_f32_4x4 3 32 11 0.0000043748460947431475 +armv7neon_mmm_f32_8x4_cortexa9 9 128 5 0.0000200735894165016 +armv7neon_mmm_f32_8x4_cortexa7 17 4 3 0.000002458111098809617 
+generic_f32_4x4 5 128 11 0.0000211987993849004 +generic_f32_4x4 12 32 7 0.00000782487986470221 +armv7neon_mmm_f32_8x4_generic 7 128 11 0.000014659497491826627 +armv7neon_mmm_f32_8x6_cortexa7 17 128 11 0.00003969667780371537 +armv7neon_mmm_f32_8x6_generic 7 4 6 0.0000012612776856940052 +armv7neon_mmm_f32_8x6_cortexa9 7 4 6 0.0000012711169589040185 +armv7neon_mmm_f32_8x6_cortexa7 8 4 13 0.0000024259808837385263 +armv7neon_mmm_f32_8x4_cortexa7 24 128 9 0.000043285670969312205 +armv7neon_mmm_f32_8x6_generic 15 128 19 0.00005126058311511034 +generic_f32_4x4 13 128 11 0.00004170340313011054 +armv7neon_mmm_f32_8x6_cortexa9 8 128 12 0.000013510136481590433 +armv7neon_mmm_f32_8x6_generic 23 32 17 0.00001815155326925384 +armv7neon_mmm_f32_8x4_generic 24 32 8 0.000008758373656325976 +armv7neon_mmm_f32_8x4_generic 15 32 13 0.000012566353080909321 +armv7neon_mmm_f32_8x4_generic 15 4 9 0.000004138286673960469 +armv7neon_mmm_f32_8x4_cortexa7 16 32 4 0.000003365053985432853 +armv7neon_mmm_f32_8x4_cortexa7 9 128 3 0.0000101978289798725 +generic_f32_4x4 11 32 8 0.000007681371193157391 +generic_f32_4x4 7 128 9 0.00002127278439974851 +armv7neon_mmm_f32_8x4_cortexa7 24 4 13 0.000006759425188306811 +armv7neon_mmm_f32_8x4_cortexa7 24 32 9 0.000013896132257150766 +armv7neon_mmm_f32_8x4_cortexa9 25 32 9 0.000018829874971593756 +armv7neon_mmm_f32_8x4_cortexa7 9 32 4 0.000003474275688786252 +armv7neon_mmm_f32_8x6_cortexa7 8 32 7 0.0000044833206937891445 +generic_f32_4x4 3 32 7 0.0000030714723992174605 +armv7neon_mmm_f32_8x6_cortexa9 17 4 11 0.000004585922035285582 +armv7neon_mmm_f32_8x4_cortexa9 24 128 5 0.000029781338716311416 +armv7neon_mmm_f32_8x6_generic 15 32 18 0.000012214298327316169 +armv7neon_mmm_f32_8x6_generic 25 32 6 0.000007957229153899586 +armv7neon_mmm_f32_8x6_cortexa9 9 32 18 0.000012307833924250695 +armv7neon_mmm_f32_8x6_generic 8 32 13 0.000006159219915111089 +armv7neon_mmm_f32_8x4_cortexa7 8 128 3 0.000005401956819775254 +armv7neon_mmm_f32_8x4_generic 15 32 9 
0.000009577293347035024 +armv7neon_mmm_f32_8x4_cortexa9 25 128 13 0.00007830393224178603 +generic_f32_4x4 4 4 13 0.0000026249296920626683 +armv7neon_mmm_f32_8x6_cortexa9 8 32 6 0.000002404882860716393 +armv7neon_mmm_f32_8x6_cortexa9 9 32 5 0.0000045921033389599396 +armv7neon_mmm_f32_8x6_generic 17 128 11 0.000038492804993535863 +armv7neon_mmm_f32_8x4_generic 17 32 9 0.000013624172849176604 +armv7neon_mmm_f32_8x4_cortexa9 24 4 9 0.000005393334776799241 +armv7neon_mmm_f32_8x4_cortexa9 23 32 8 0.000009723809855547939 +armv7neon_mmm_f32_8x4_cortexa9 25 4 4 0.0000025900920077984026 +armv7neon_mmm_f32_8x6_generic 23 4 5 0.0000027608446978317408 +armv7neon_mmm_f32_8x4_generic 8 4 12 0.0000019060496583230645 +armv7neon_mmm_f32_8x4_cortexa7 15 32 8 0.000006678781571694655 +generic_f32_4x4 4 4 9 0.0000021081574199754717 +armv7neon_mmm_f32_8x4_cortexa7 17 128 9 0.00004346458158147509 +armv7neon_mmm_f32_8x4_generic 25 4 8 0.000004544403472760322 +armv7neon_mmm_f32_8x4_generic 24 128 4 0.000013981279265161887 +armv7neon_mmm_f32_8x6_cortexa7 25 4 12 0.000005353769509302435 +armv7neon_mmm_f32_8x4_cortexa9 15 4 8 0.000002871100263594495 +armv7neon_mmm_f32_8x6_cortexa7 9 4 12 0.0000030500030063725167 +armv7neon_mmm_f32_8x4_cortexa9 15 128 7 0.000020348668200781455 +generic_f32_4x4 4 4 3 0.0000011380263710773134 +armv7neon_mmm_f32_8x4_generic 17 32 7 0.000009461004385359903 +armv7neon_mmm_f32_8x6_cortexa7 16 4 11 0.000003292952568894936 +armv7neon_mmm_f32_8x4_generic 9 128 7 0.0000189264627670604 +armv7neon_mmm_f32_8x4_cortexa7 9 32 12 0.000009404582356065284 +armv7neon_mmm_f32_8x6_cortexa7 24 4 11 0.000004645745440552646 +armv7neon_mmm_f32_8x4_generic 7 4 8 0.0000018628381186813514 +generic_f32_4x4 13 32 12 0.000014451033360231452 +armv7neon_mmm_f32_8x6_generic 7 128 11 0.00001335338354211472 +armv7neon_mmm_f32_8x4_cortexa9 24 32 5 0.000009736881086291142 +armv7neon_mmm_f32_8x4_generic 9 32 8 0.000006244209351860461 +armv7neon_mmm_f32_8x4_cortexa7 23 32 7 0.000009963814618687657 
+armv7neon_mmm_f32_8x6_cortexa7 24 128 7 0.000039514548121680776 +armv7neon_mmm_f32_8x4_cortexa7 17 32 5 0.000009649055437118146 +armv7neon_mmm_f32_8x6_generic 7 128 7 0.000013256390141337729 +armv7neon_mmm_f32_8x6_cortexa7 9 128 17 0.00003956707654525446 +armv7neon_mmm_f32_8x4_generic 9 128 8 0.000018708339822835962 +generic_f32_4x4 9 4 7 0.00000399084452417743 +armv7neon_mmm_f32_8x4_generic 15 128 8 0.000018936209183566976 +armv7neon_mmm_f32_8x6_cortexa7 9 32 19 0.00001616084997042782 +armv7neon_mmm_f32_8x6_generic 16 128 5 0.00001333369244237612 +armv7neon_mmm_f32_8x4_cortexa7 9 32 3 0.0000036715654135840626 +armv7neon_mmm_f32_8x4_cortexa9 23 32 5 0.000009965038150728467 +armv7neon_mmm_f32_8x6_cortexa7 9 128 7 0.000026494406670274787 +armv7neon_mmm_f32_8x4_cortexa9 24 128 13 0.00005851957886711075 +armv7neon_mmm_f32_8x6_cortexa7 9 4 18 0.000004273474311217914 +generic_f32_4x4 5 32 12 0.000007635355402928665 +armv7neon_mmm_f32_8x4_cortexa9 7 128 7 0.000010528249080204253 +armv7neon_mmm_f32_8x6_cortexa9 24 32 19 0.00002386884326098149 +armv7neon_mmm_f32_8x6_generic 8 4 7 0.0000017826372979170567 +armv7neon_mmm_f32_8x4_generic 7 4 5 0.000001794625708774814 +armv7neon_mmm_f32_8x6_cortexa7 9 4 6 0.000001801617764943881 +armv7neon_mmm_f32_8x4_generic 25 32 13 0.00002352372957802219 +armv7neon_mmm_f32_8x4_generic 24 128 13 0.00005496909042425587 +armv7neon_mmm_f32_8x6_cortexa9 23 32 17 0.000018875847839557392 +generic_f32_4x4 12 32 4 0.000003926824640983302 +armv7neon_mmm_f32_8x6_cortexa9 7 128 18 0.000020631769324589798 +armv7neon_mmm_f32_8x6_cortexa9 8 128 19 0.000026726033464302463 +armv7neon_mmm_f32_8x6_cortexa7 7 4 5 0.0000012346138819052426 +armv7neon_mmm_f32_8x4_generic 17 128 7 0.00002816390176070489 +armv7neon_mmm_f32_8x6_generic 9 32 19 0.000015684300494904117 +armv7neon_mmm_f32_8x4_cortexa7 9 128 4 0.000010001875449763685 +armv7neon_mmm_f32_8x4_generic 23 32 13 0.000018251850356188044 +generic_f32_4x4 13 4 3 0.000003021647059953821 
+armv7neon_mmm_f32_8x6_cortexa9 25 128 7 0.000053220906812986594 +armv7neon_mmm_f32_8x4_cortexa9 16 128 8 0.00001967608213206112 +armv7neon_mmm_f32_8x6_generic 24 32 5 0.000006620165371496042 +armv7neon_mmm_f32_8x4_cortexa7 9 4 7 0.0000028674611086712135 +armv7neon_mmm_f32_8x4_cortexa9 17 4 11 0.000005653151614630126 +generic_f32_4x4 13 4 4 0.000002594385628155042 +armv7neon_mmm_f32_8x6_generic 25 128 5 0.000026094659734105824 +armv7neon_mmm_f32_8x4_generic 17 32 3 0.000005161144602046728 +armv7neon_mmm_f32_8x4_cortexa9 24 32 11 0.000014220571969428708 +generic_f32_4x4 4 128 3 0.000004015357652724343 +armv7neon_mmm_f32_8x4_cortexa9 8 128 5 0.00001027176965651363 +armv7neon_mmm_f32_8x6_cortexa7 15 4 17 0.000004874430261781061 +armv7neon_mmm_f32_8x4_cortexa7 15 4 12 0.0000040169005856724945 +armv7neon_mmm_f32_8x4_cortexa7 8 32 12 0.00000477050855780904 +armv7neon_mmm_f32_8x6_cortexa7 9 4 7 0.0000031516329281183407 +armv7neon_mmm_f32_8x4_generic 7 32 11 0.000005308351521432454 +armv7neon_mmm_f32_8x4_cortexa7 25 32 5 0.000012670365676368953 +generic_f32_4x4 5 128 5 0.000014294465011666321 +armv7neon_mmm_f32_8x6_cortexa7 7 32 17 0.000006818696484359277 +armv7neon_mmm_f32_8x4_cortexa9 24 32 7 0.00000986925847989351 +generic_f32_4x4 4 128 13 0.000014106152963483687 +armv7neon_mmm_f32_8x4_cortexa9 16 128 3 0.000010512285712675924 +generic_f32_4x4 8 128 4 0.000007240602279573043 +armv7neon_mmm_f32_8x4_cortexa9 7 128 9 0.00001548686209510202 +armv7neon_mmm_f32_8x4_cortexa9 7 32 3 0.000002138015694680355 +armv7neon_mmm_f32_8x4_generic 24 32 13 0.000017536918553280824 +armv7neon_mmm_f32_8x6_cortexa9 23 4 12 0.0000044455569065126315 +armv7neon_mmm_f32_8x4_cortexa7 8 4 13 0.0000026000896469710663 +armv7neon_mmm_f32_8x4_cortexa7 25 32 8 0.000012179917551949525 +generic_f32_4x4 5 32 5 0.000005390504735926875 +generic_f32_4x4 11 128 13 0.00004175553018581371 +armv7neon_mmm_f32_8x4_cortexa7 7 32 4 0.0000021521595449855865 +armv7neon_mmm_f32_8x4_cortexa9 8 4 9 0.000002148291671877107 
+armv7neon_mmm_f32_8x6_cortexa7 24 4 18 0.000005654885858199984 +armv7neon_mmm_f32_8x4_generic 25 128 9 0.00005533756635099798 +armv7neon_mmm_f32_8x4_cortexa9 9 128 9 0.00002978120797293901 +armv7neon_mmm_f32_8x6_cortexa7 8 32 12 0.000004281164608278993 +armv7neon_mmm_f32_8x4_generic 9 4 13 0.000004935869563311462 +armv7neon_mmm_f32_8x4_cortexa7 23 128 3 0.000015184873420309865 +generic_f32_4x4 12 4 12 0.000004912969841843077 +armv7neon_mmm_f32_8x6_generic 16 4 7 0.0000030460066806302122 +armv7neon_mmm_f32_8x4_cortexa7 9 4 13 0.000004992355989646887 +armv7neon_mmm_f32_8x6_cortexa9 15 4 12 0.0000032832021366886516 +armv7neon_mmm_f32_8x6_cortexa9 17 4 7 0.000004416231510276644 +armv7neon_mmm_f32_8x4_generic 9 128 5 0.000018874961280340985 +armv7neon_mmm_f32_8x4_cortexa7 25 4 9 0.000007146809416343016 +armv7neon_mmm_f32_8x6_cortexa9 23 4 5 0.000002791403862004092 +generic_f32_4x4 3 128 11 0.000011052312539972672 +armv7neon_mmm_f32_8x4_cortexa7 16 32 13 0.000012274313788252517 +generic_f32_4x4 5 4 8 0.000002691374011509211 +armv7neon_mmm_f32_8x6_generic 8 4 12 0.0000016391318699699268 +generic_f32_4x4 8 4 12 0.0000034590196765442797 +armv7neon_mmm_f32_8x6_generic 17 128 18 0.00005685255235859907 +armv7neon_mmm_f32_8x6_cortexa9 17 128 6 0.00002017521356859541 +armv7neon_mmm_f32_8x6_generic 25 128 6 0.00002553797390337986 +armv7neon_mmm_f32_8x6_cortexa9 25 128 5 0.000027210481604244293 +armv7neon_mmm_f32_8x4_cortexa7 15 128 7 0.000019945561157727263 +armv7neon_mmm_f32_8x4_generic 7 4 11 0.000002550958711536068 +generic_f32_4x4 13 4 8 0.000004699429428545618 +armv7neon_mmm_f32_8x4_cortexa9 9 4 5 0.0000028517696924614755 +armv7neon_mmm_f32_8x4_cortexa9 8 4 3 0.0000012050980961045702 +armv7neon_mmm_f32_8x6_cortexa9 24 128 17 0.000059842842681725297 +armv7neon_mmm_f32_8x6_cortexa9 25 128 6 0.000026691322740044253 +generic_f32_4x4 9 32 3 0.000004333031052520466 +armv7neon_mmm_f32_8x6_cortexa7 9 32 11 0.000008502744568312225 +armv7neon_mmm_f32_8x6_cortexa9 16 32 11 
0.000008643328393303817 +armv7neon_mmm_f32_8x6_cortexa9 16 4 11 0.00000327884229270897 +armv7neon_mmm_f32_8x4_generic 8 128 5 0.000009661930895932827 +armv7neon_mmm_f32_8x4_cortexa9 15 4 5 0.0000029989007242676373 +armv7neon_mmm_f32_8x4_cortexa7 23 32 11 0.000014477721812785248 +armv7neon_mmm_f32_8x6_cortexa7 9 4 13 0.000004411946245984795 +armv7neon_mmm_f32_8x6_generic 7 128 12 0.000013345094425498999 +armv7neon_mmm_f32_8x4_cortexa9 24 128 11 0.00004426398857587651 +armv7neon_mmm_f32_8x6_generic 25 128 7 0.00005086677641643256 +armv7neon_mmm_f32_8x6_generic 16 32 17 0.00001197546406643872 +generic_f32_4x4 4 32 13 0.000005196854104063007 +armv7neon_mmm_f32_8x4_cortexa7 16 4 7 0.000002852179426317928 +armv7neon_mmm_f32_8x6_cortexa7 23 4 7 0.000004598943528457281 +armv7neon_mmm_f32_8x6_generic 17 32 6 0.000006120980761955415 +armv7neon_mmm_f32_8x6_cortexa7 17 32 17 0.00001829474178461942 +generic_f32_4x4 7 4 12 0.000003918843035175344 +armv7neon_mmm_f32_8x6_generic 9 4 7 0.0000030530125579888694 +armv7neon_mmm_f32_8x6_cortexa9 23 4 13 0.000006537462226861253 +generic_f32_4x4 12 32 11 0.000011229611140019307 +armv7neon_mmm_f32_8x4_cortexa7 7 32 9 0.000005392896316953727 +armv7neon_mmm_f32_8x4_cortexa9 15 4 7 0.00000310030600994376 +armv7neon_mmm_f32_8x4_cortexa9 23 32 3 0.000005486033134258231 +armv7neon_mmm_f32_8x6_cortexa7 25 4 11 0.000005954018792233075 +armv7neon_mmm_f32_8x4_generic 23 128 8 0.000027951206051686114 +armv7neon_mmm_f32_8x6_generic 23 4 12 0.000004342351795004267 +armv7neon_mmm_f32_8x4_cortexa9 8 32 5 0.0000035943022440195 +armv7neon_mmm_f32_8x6_cortexa9 7 4 18 0.0000028075381670446466 +armv7neon_mmm_f32_8x4_generic 8 32 13 0.0000061830339053399435 +armv7neon_mmm_f32_8x6_generic 15 32 7 0.000008321528944284549 +armv7neon_mmm_f32_8x6_cortexa7 16 128 6 0.00001332317233161335 +armv7neon_mmm_f32_8x4_generic 9 4 12 0.000003689645307663438 +armv7neon_mmm_f32_8x4_cortexa9 16 32 7 0.000006751619359417928 +generic_f32_4x4 7 4 4 0.0000016418721244643219 
+armv7neon_mmm_f32_8x4_generic 7 128 13 0.000019287387814897854 +armv7neon_mmm_f32_8x6_cortexa9 23 4 11 0.000004831555332090134 +armv7neon_mmm_f32_8x4_cortexa9 8 128 8 0.00001009639921913338 +armv7neon_mmm_f32_8x6_generic 23 32 5 0.000006582337877689147 +armv7neon_mmm_f32_8x4_generic 25 128 3 0.00001920626106575073 +armv7neon_mmm_f32_8x4_generic 7 32 8 0.000003711210711678028 +armv7neon_mmm_f32_8x6_cortexa7 16 128 19 0.00005218314273770285 +armv7neon_mmm_f32_8x6_cortexa9 7 128 12 0.000013936034942170719 +armv7neon_mmm_f32_8x6_generic 17 32 5 0.000006487710283772798 +armv7neon_mmm_f32_8x4_generic 8 128 7 0.000009698770362126403 +armv7neon_mmm_f32_8x4_generic 15 4 7 0.0000030411777158900867 +armv7neon_mmm_f32_8x6_generic 25 128 19 0.00010155326542007438 +armv7neon_mmm_f32_8x6_cortexa7 9 128 5 0.000013650595324888383 +armv7neon_mmm_f32_8x4_cortexa9 9 32 11 0.00000982080901853953 +generic_f32_4x4 9 32 11 0.00001141647780993558 +generic_f32_4x4 8 32 11 0.000007662846153151327 +armv7neon_mmm_f32_8x6_generic 8 128 17 0.000019396378758926843 +armv7neon_mmm_f32_8x6_cortexa9 8 4 17 0.00000248907607573393 +generic_f32_4x4 13 128 12 0.000041173556961397094 +armv7neon_mmm_f32_8x4_generic 25 32 3 0.0000067405543706379265 +armv7neon_mmm_f32_8x6_cortexa9 16 4 13 0.000004284450398155638 +armv7neon_mmm_f32_8x6_generic 7 32 13 0.000006564286527582003 +generic_f32_4x4 9 4 12 0.000005239644799087451 +generic_f32_4x4 3 128 12 0.00001105053431182824 +armv7neon_mmm_f32_8x4_cortexa9 16 32 12 0.000009204841041049037 +armv7neon_mmm_f32_8x4_cortexa9 8 4 7 0.0000016950881959591487 +armv7neon_mmm_f32_8x4_cortexa9 8 4 8 0.00000148340332010083 +armv7neon_mmm_f32_8x4_cortexa9 24 32 4 0.000004874669124633379 +armv7neon_mmm_f32_8x4_cortexa9 16 128 7 0.00002009832072993214 +armv7neon_mmm_f32_8x4_generic 16 4 5 0.000002739180418018979 +armv7neon_mmm_f32_8x4_cortexa9 8 4 12 0.000001953874992361998 +armv7neon_mmm_f32_8x4_generic 15 4 4 0.0000016638026379811712 +armv7neon_mmm_f32_8x4_cortexa9 25 128 8 
0.000039087913140379676 +armv7neon_mmm_f32_8x6_cortexa9 17 4 12 0.000004212564422068874 +armv7neon_mmm_f32_8x6_cortexa9 8 32 5 0.00000263061064210809 +armv7neon_mmm_f32_8x4_generic 8 4 5 0.000001623245576042727 +armv7neon_mmm_f32_8x6_cortexa7 15 128 6 0.000013618693064061564 +armv7neon_mmm_f32_8x6_cortexa7 7 4 18 0.000002804843066747126 +generic_f32_4x4 3 4 11 0.0000023913102926543824 +armv7neon_mmm_f32_8x6_generic 16 4 6 0.000001638807649221633 +armv7neon_mmm_f32_8x6_generic 8 128 6 0.0000067051548997103826 +armv7neon_mmm_f32_8x6_cortexa9 16 4 6 0.0000016568160988445508 +armv7neon_mmm_f32_8x6_cortexa9 16 128 7 0.000026851544464649677 +armv7neon_mmm_f32_8x4_cortexa9 8 128 12 0.000014872272678272856 +armv7neon_mmm_f32_8x6_generic 24 128 19 0.00007580588220559739 +generic_f32_4x4 3 128 5 0.0000074971724012246544 +generic_f32_4x4 3 128 8 0.000007539035525450043 +armv7neon_mmm_f32_8x6_cortexa9 24 128 7 0.00004003350952160301 +armv7neon_mmm_f32_8x4_cortexa7 23 32 9 0.000014356067971382847 +armv7neon_mmm_f32_8x6_cortexa9 15 32 18 0.000012709917183710011 +generic_f32_4x4 9 128 3 0.000011011983185480526 +armv7neon_mmm_f32_8x4_generic 15 128 7 0.000019154850070723174 +generic_f32_4x4 7 32 12 0.000007829205768353918 +generic_f32_4x4 12 4 3 0.000002428328614613921 +armv7neon_mmm_f32_8x4_cortexa7 8 128 4 0.000005191807529366239 +armv7neon_mmm_f32_8x4_cortexa9 17 128 3 0.00001540571905602067 +armv7neon_mmm_f32_8x4_cortexa9 17 128 9 0.000044358855138628255 +armv7neon_mmm_f32_8x4_generic 8 32 4 0.0000018712794095957982 +armv7neon_mmm_f32_8x6_cortexa9 9 128 7 0.000026855539948208353 +armv7neon_mmm_f32_8x6_cortexa7 7 128 13 0.000020286298173428065 +armv7neon_mmm_f32_8x4_cortexa7 7 32 7 0.0000037916453247576294 +armv7neon_mmm_f32_8x4_cortexa7 16 128 11 0.000029071525948432538 +armv7neon_mmm_f32_8x4_generic 8 128 8 0.000009479495707619602 +armv7neon_mmm_f32_8x4_cortexa7 9 32 13 0.000012575930597830614 +armv7neon_mmm_f32_8x6_cortexa7 7 4 7 0.0000019563323222872665 
+armv7neon_mmm_f32_8x4_cortexa7 15 128 3 0.000010282742767020747 +armv7neon_mmm_f32_8x4_generic 17 128 9 0.00004166868490626211 +armv7neon_mmm_f32_8x6_cortexa9 15 32 11 0.000008848994549844927 +armv7neon_mmm_f32_8x4_cortexa7 16 128 7 0.000019694781274526964 +armv7neon_mmm_f32_8x6_cortexa9 7 128 17 0.00002067047797610865 +armv7neon_mmm_f32_8x4_cortexa7 16 4 8 0.000002421339765366324 +armv7neon_mmm_f32_8x6_cortexa9 25 4 19 0.000010458910160208884 +armv7neon_mmm_f32_8x4_cortexa7 17 4 12 0.000005174031688170145 +armv7neon_mmm_f32_8x4_cortexa7 8 128 7 0.000010096523777664552 +armv7neon_mmm_f32_8x4_cortexa7 9 32 11 0.000009664173576551376 +armv7neon_mmm_f32_8x4_cortexa9 15 32 13 0.000013172771066560177 +armv7neon_mmm_f32_8x6_cortexa7 15 32 6 0.0000045760506363113526 +armv7neon_mmm_f32_8x4_generic 16 128 7 0.000018904285841254675 +armv7neon_mmm_f32_8x6_cortexa9 24 32 7 0.000012425961839460377 +armv7neon_mmm_f32_8x4_cortexa7 8 128 8 0.00000988276098193371 +armv7neon_mmm_f32_8x4_cortexa7 25 32 13 0.000024332756642022888 +armv7neon_mmm_f32_8x6_cortexa9 8 4 12 0.000001657247635058824 +armv7neon_mmm_f32_8x4_cortexa7 8 4 9 0.000002131892294337394 +generic_f32_4x4 8 4 8 0.000002499295427359815 +armv7neon_mmm_f32_8x6_cortexa9 16 128 5 0.000013927438070206447 +generic_f32_4x4 3 4 7 0.0000017546173861422715 +armv7neon_mmm_f32_8x4_cortexa7 23 128 12 0.000043445426131070976 +armv7neon_mmm_f32_8x4_generic 8 128 12 0.00001395887153959496 +armv7neon_mmm_f32_8x4_cortexa9 7 4 11 0.0000025905119707065483 +generic_f32_4x4 12 128 4 0.00001060459652081298 +armv7neon_mmm_f32_8x6_generic 17 128 13 0.00005710733590612565 +armv7neon_mmm_f32_8x4_cortexa7 7 4 11 0.000002572252868831556 +armv7neon_mmm_f32_8x6_generic 15 128 11 0.000026019339889436572 +generic_f32_4x4 9 4 5 0.000003906378066863749 +armv7neon_mmm_f32_8x6_cortexa9 15 4 17 0.00000485956993190203 +armv7neon_mmm_f32_8x6_generic 9 32 17 0.000012038137584352969 +armv7neon_mmm_f32_8x4_cortexa9 8 32 11 0.000005081279026778461 
+armv7neon_mmm_f32_8x4_cortexa7 25 128 11 0.000057856501014562726 +armv7neon_mmm_f32_8x6_cortexa9 25 32 19 0.00003186559880573022 +armv7neon_mmm_f32_8x6_cortexa9 8 32 12 0.000004319717163614961 +armv7neon_mmm_f32_8x6_cortexa9 8 32 19 0.000008329624592686573 +armv7neon_mmm_f32_8x6_cortexa9 9 32 13 0.000012409078932821099 +armv7neon_mmm_f32_8x6_cortexa9 17 128 12 0.00003982138220281054 +armv7neon_mmm_f32_8x4_cortexa9 8 32 4 0.0000019683045582115826 +armv7neon_mmm_f32_8x6_cortexa9 17 128 13 0.000059714513355137016 +armv7neon_mmm_f32_8x6_cortexa9 25 32 13 0.00002415767938328582 +armv7neon_mmm_f32_8x4_cortexa7 7 128 8 0.000010344849965132014 +armv7neon_mmm_f32_8x6_cortexa9 8 128 11 0.000013785904590501993 +armv7neon_mmm_f32_8x4_cortexa7 24 32 13 0.000018144082756384124 +armv7neon_mmm_f32_8x6_generic 15 4 13 0.000004567420998191548 +armv7neon_mmm_f32_8x4_generic 9 32 13 0.000012173282994755266 +generic_f32_4x4 4 4 4 0.0000009923563076021443 +armv7neon_mmm_f32_8x4_cortexa9 24 128 7 0.000029905631027460932 +armv7neon_mmm_f32_8x4_cortexa7 9 128 8 0.0000195010462183809 +armv7neon_mmm_f32_8x6_cortexa9 9 4 5 0.000001915573042137183 +generic_f32_4x4 7 128 12 0.000021189012747949428 +armv7neon_mmm_f32_8x4_generic 23 32 9 0.000013896926880805095 +armv7neon_mmm_f32_8x4_generic 8 128 13 0.000018647991490366943 +armv7neon_mmm_f32_8x4_cortexa7 15 32 11 0.00000998072291720506 +generic_f32_4x4 3 128 4 0.0000040235161328678346 +armv7neon_mmm_f32_8x4_generic 9 128 3 0.000009799742651068634 +armv7neon_mmm_f32_8x6_generic 25 4 11 0.000005815747914439624 +armv7neon_mmm_f32_8x4_cortexa9 9 4 12 0.0000037593832702220446 +armv7neon_mmm_f32_8x4_generic 8 32 7 0.0000034630101008340834 +armv7neon_mmm_f32_8x4_cortexa9 24 4 7 0.000004049582371373125 +armv7neon_mmm_f32_8x6_generic 9 32 12 0.000008052850558834548 +armv7neon_mmm_f32_8x4_cortexa7 25 4 3 0.000003139298464286286 +armv7neon_mmm_f32_8x4_generic 15 4 3 0.0000018347526374757481 +armv7neon_mmm_f32_8x6_cortexa9 25 4 18 0.000007705086964181997 
+generic_f32_4x4 5 128 13 0.00002796513369706675 +armv7neon_mmm_f32_8x4_cortexa9 23 4 12 0.0000055210994766811515 +generic_f32_4x4 7 128 4 0.000007400928983118127 +generic_f32_4x4 12 128 8 0.000020684245069264504 +armv7neon_mmm_f32_8x4_generic 9 4 7 0.0000028402738019775784 +generic_f32_4x4 13 4 9 0.000007094831203377165 +armv7neon_mmm_f32_8x4_cortexa9 17 4 4 0.000002109614550782161 +generic_f32_4x4 9 32 7 0.00000788121798038347 +armv7neon_mmm_f32_8x4_cortexa9 17 32 9 0.000014308883462633102 +armv7neon_mmm_f32_8x4_generic 16 128 12 0.00002742841574783841 +armv7neon_mmm_f32_8x6_generic 9 32 13 0.000011930679710766873 +armv7neon_mmm_f32_8x4_cortexa7 24 128 7 0.000029306857678894273 +armv7neon_mmm_f32_8x4_cortexa9 15 4 11 0.0000043086270265983256 +generic_f32_4x4 11 4 11 0.000005690020384668188 +generic_f32_4x4 11 4 5 0.000003971103754939711 +armv7neon_mmm_f32_8x4_generic 23 128 7 0.000028363425957340315 +armv7neon_mmm_f32_8x4_cortexa7 25 32 7 0.000012820527346969179 +armv7neon_mmm_f32_8x6_generic 17 4 13 0.0000061204912028192775 +armv7neon_mmm_f32_8x4_cortexa7 23 4 13 0.000007468189054284558 +generic_f32_4x4 4 32 7 0.0000029448449923973884 +armv7neon_mmm_f32_8x4_cortexa9 25 32 12 0.000018265077104449187 +armv7neon_mmm_f32_8x6_cortexa9 17 32 5 0.000006709520484710961 +armv7neon_mmm_f32_8x4_cortexa9 23 128 7 0.00003016688393762355 +armv7neon_mmm_f32_8x4_cortexa9 16 128 13 0.000039189527740613905 +armv7neon_mmm_f32_8x4_cortexa7 16 128 5 0.000019619696312768993 +armv7neon_mmm_f32_8x4_generic 17 128 13 0.00005528191915660002 +armv7neon_mmm_f32_8x6_generic 17 128 19 0.00007601141239533076 +armv7neon_mmm_f32_8x4_generic 8 32 5 0.0000034293993184402013 +generic_f32_4x4 9 32 4 0.000004039130783975973 +armv7neon_mmm_f32_8x4_cortexa9 23 32 9 0.000014579239774429512 +armv7neon_mmm_f32_8x6_cortexa7 8 128 11 0.000013612858256767989 +armv7neon_mmm_f32_8x6_cortexa9 15 32 12 0.000008658441674303882 +armv7neon_mmm_f32_8x4_generic 9 128 13 0.00003711806630993475 
+armv7neon_mmm_f32_8x6_generic 17 4 11 0.000004486220214719747 +armv7neon_mmm_f32_8x6_generic 24 32 17 0.00001769011948694583 +armv7neon_mmm_f32_8x6_generic 9 4 18 0.00000416889219962654 +armv7neon_mmm_f32_8x6_cortexa7 23 32 19 0.000024297125708617196 +armv7neon_mmm_f32_8x6_cortexa9 9 32 19 0.000016308328501596863 +generic_f32_4x4 5 32 13 0.000010150287138446791 +generic_f32_4x4 11 128 8 0.000021041592663498075 +armv7neon_mmm_f32_8x4_generic 17 32 8 0.000009030738946480375 +armv7neon_mmm_f32_8x4_generic 25 4 5 0.0000050194271140758065 +armv7neon_mmm_f32_8x4_generic 8 128 11 0.000014188042949903446 +generic_f32_4x4 13 32 11 0.000014981513303829195 +armv7neon_mmm_f32_8x4_cortexa9 8 128 4 0.000005303802744401548 +armv7neon_mmm_f32_8x6_cortexa7 17 4 5 0.0000027556390651666856 +armv7neon_mmm_f32_8x6_generic 25 128 12 0.000050521454291978235 +armv7neon_mmm_f32_8x6_cortexa9 16 4 7 0.0000031344346513242154 +armv7neon_mmm_f32_8x6_cortexa7 16 32 5 0.000004750660751815082 +armv7neon_mmm_f32_8x4_cortexa9 17 128 8 0.000029514324154126036 +armv7neon_mmm_f32_8x4_cortexa7 8 4 8 0.000001465122696586523 +armv7neon_mmm_f32_8x6_cortexa9 8 128 17 0.00002029296708644369 +armv7neon_mmm_f32_8x4_generic 25 32 12 0.00001736483445253461 +generic_f32_4x4 3 32 5 0.0000030443235090952762 +generic_f32_4x4 7 128 11 0.000021331736800406107 +armv7neon_mmm_f32_8x4_generic 24 128 3 0.000014605163771206865 +armv7neon_mmm_f32_8x4_cortexa9 15 32 8 0.000006785141126690698 +armv7neon_mmm_f32_8x6_cortexa7 23 4 11 0.000004845198031614994 +armv7neon_mmm_f32_8x6_cortexa7 8 4 18 0.0000022246057256413265 +armv7neon_mmm_f32_8x4_generic 23 32 12 0.000013609481500575639 +armv7neon_mmm_f32_8x6_generic 7 128 17 0.00001990485288299381 +armv7neon_mmm_f32_8x6_cortexa7 25 128 19 0.00010434796032564001 +armv7neon_mmm_f32_8x6_generic 15 4 7 0.0000032121848403447733 +armv7neon_mmm_f32_8x4_generic 25 32 9 0.00001793284879597201 +armv7neon_mmm_f32_8x4_cortexa7 25 4 5 0.0000050795046947410985 +armv7neon_mmm_f32_8x4_cortexa7 
17 32 12 0.000013704665363244266 +generic_f32_4x4 5 32 3 0.0000030334097589492826 +armv7neon_mmm_f32_8x4_cortexa9 17 128 5 0.000030021544003020483 +armv7neon_mmm_f32_8x4_cortexa7 24 4 9 0.000005360352425382144 +armv7neon_mmm_f32_8x6_cortexa7 9 32 18 0.000012173297962511017 +armv7neon_mmm_f32_8x4_cortexa7 15 4 13 0.000005358787865343933 +armv7neon_mmm_f32_8x6_cortexa9 17 32 11 0.000012635537581820717 +armv7neon_mmm_f32_8x4_generic 9 32 12 0.000009105324476823614 +armv7neon_mmm_f32_8x4_cortexa7 17 128 11 0.00004382978043849 +armv7neon_mmm_f32_8x6_generic 17 4 6 0.0000023082534746308964 +armv7neon_mmm_f32_8x6_cortexa7 8 128 17 0.000020059146620211786 +armv7neon_mmm_f32_8x4_cortexa7 15 4 7 0.000003068689058039639 +armv7neon_mmm_f32_8x4_cortexa9 15 4 13 0.000005392948403824124 +armv7neon_mmm_f32_8x4_cortexa7 15 4 8 0.0000028443546860500008 +armv7neon_mmm_f32_8x4_cortexa9 24 4 3 0.000002583780394094933 +generic_f32_4x4 13 4 11 0.0000072216668884748755 +armv7neon_mmm_f32_8x6_cortexa7 25 4 13 0.00000811613473861232 +armv7neon_mmm_f32_8x6_generic 7 32 18 0.000006623134760255213 +armv7neon_mmm_f32_8x4_cortexa7 16 4 4 0.0000014656631347323804 +armv7neon_mmm_f32_8x6_cortexa9 24 128 12 0.00003980745750816077 +armv7neon_mmm_f32_8x6_cortexa9 17 32 6 0.00000638262644761529 +generic_f32_4x4 8 128 12 0.000020674001160178604 +armv7neon_mmm_f32_8x6_cortexa9 9 128 5 0.000013783853077105059 +armv7neon_mmm_f32_8x4_cortexa7 23 4 11 0.00000590971284549428 +armv7neon_mmm_f32_8x6_generic 17 128 7 0.00003951740163826464 +armv7neon_mmm_f32_8x6_cortexa7 23 128 12 0.00003972987011474437 +generic_f32_4x4 11 32 9 0.000011454015846057979 +armv7neon_mmm_f32_8x4_generic 7 128 8 0.000009944618306958537 +armv7neon_mmm_f32_8x4_cortexa7 9 128 12 0.00002904338895016975 +generic_f32_4x4 7 32 3 0.000003062772103932835 +armv7neon_mmm_f32_8x6_generic 7 32 19 0.000008594590275392258 +armv7neon_mmm_f32_8x4_cortexa9 24 4 4 0.0000019652194228578485 +armv7neon_mmm_f32_8x6_generic 25 32 17 0.00002354000930758944 
+armv7neon_mmm_f32_8x4_cortexa7 24 4 12 0.000004776517171804484 +generic_f32_4x4 9 128 12 0.00003113614108946529 +armv7neon_mmm_f32_8x4_generic 25 4 13 0.000009058320096239628 +armv7neon_mmm_f32_8x4_cortexa9 15 32 7 0.000007013841485260632 +armv7neon_mmm_f32_8x6_cortexa7 8 32 13 0.000006373539570632082 +armv7neon_mmm_f32_8x6_cortexa9 7 128 7 0.000013849670236184067 +armv7neon_mmm_f32_8x6_cortexa7 16 4 5 0.0000021140389285040373 +armv7neon_mmm_f32_8x6_cortexa7 15 4 19 0.000006018452488388481 +armv7neon_mmm_f32_8x4_cortexa9 25 4 11 0.000007297177417716392 +armv7neon_mmm_f32_8x4_cortexa7 7 128 3 0.0000053704412970208095 +armv7neon_mmm_f32_8x6_cortexa7 25 32 6 0.000008216173586447742 +armv7neon_mmm_f32_8x6_cortexa9 23 32 6 0.000006530580171598388 +armv7neon_mmm_f32_8x6_cortexa9 8 4 11 0.0000019099336489036645 +armv7neon_mmm_f32_8x4_cortexa7 8 4 3 0.0000011915557137944658 +armv7neon_mmm_f32_8x4_cortexa9 7 32 13 0.000007135759350139172 +armv7neon_mmm_f32_8x4_cortexa9 15 128 8 0.000020152163526097688 +armv7neon_mmm_f32_8x6_generic 16 128 12 0.000025531348127989936 +armv7neon_mmm_f32_8x4_cortexa9 9 32 8 0.00000655179781932232 +armv7neon_mmm_f32_8x4_generic 25 4 7 0.000005150952172982353 +armv7neon_mmm_f32_8x6_generic 23 128 7 0.00003975509670222853 +generic_f32_4x4 7 128 7 0.000014417564018686427 +armv7neon_mmm_f32_8x6_generic 7 32 6 0.0000025435007870682203 +generic_f32_4x4 12 32 13 0.000014516218594466092 +armv7neon_mmm_f32_8x6_generic 24 4 13 0.000005989131584497686 +armv7neon_mmm_f32_8x6_cortexa7 24 4 13 0.000006130318576111218 +armv7neon_mmm_f32_8x4_cortexa7 24 4 8 0.0000033735678137031244 +armv7neon_mmm_f32_8x6_cortexa9 15 32 6 0.000004614375157776428 +armv7neon_mmm_f32_8x4_cortexa7 17 128 5 0.000029428735157155848 +generic_f32_4x4 11 32 3 0.000004361903012775443 +armv7neon_mmm_f32_8x6_cortexa9 24 128 11 0.00004051475238084589 +armv7neon_mmm_f32_8x6_cortexa7 8 4 11 0.0000019289152314594922 +armv7neon_mmm_f32_8x4_generic 7 32 12 0.00000529445196979877 
+armv7neon_mmm_f32_8x4_cortexa9 25 4 9 0.0000071851687447521495 +armv7neon_mmm_f32_8x4_cortexa9 7 128 13 0.0000205036178463644 +armv7neon_mmm_f32_8x4_cortexa7 25 128 4 0.00001951695951825782 +generic_f32_4x4 11 32 11 0.0000115439545889861 +armv7neon_mmm_f32_8x6_cortexa7 23 32 7 0.000012514048765991971 +generic_f32_4x4 13 32 8 0.000009849544291047146 +armv7neon_mmm_f32_8x4_cortexa9 24 4 13 0.0000068036126797526975 +armv7neon_mmm_f32_8x6_cortexa9 23 128 7 0.00004044892329500272 +armv7neon_mmm_f32_8x4_cortexa9 24 32 9 0.000014128824207386303 +armv7neon_mmm_f32_8x6_cortexa9 15 4 19 0.000006004743734908054 +armv7neon_mmm_f32_8x6_generic 16 128 7 0.000025803175203093092 +armv7neon_mmm_f32_8x4_generic 24 4 9 0.000005289440520085159 +armv7neon_mmm_f32_8x4_generic 7 128 7 0.000009924053412179527 +armv7neon_mmm_f32_8x6_cortexa7 15 4 5 0.0000020760458197512403 +armv7neon_mmm_f32_8x4_cortexa9 23 32 12 0.000014302742013508701 +armv7neon_mmm_f32_8x6_cortexa9 23 128 18 0.00006018418795895432 +armv7neon_mmm_f32_8x4_cortexa7 7 128 12 0.000015239535080520215 +armv7neon_mmm_f32_8x6_generic 15 4 6 0.0000018544842241934815 +armv7neon_mmm_f32_8x6_cortexa7 25 32 18 0.000023497641993495917 +generic_f32_4x4 9 4 8 0.0000037065692747303202 +armv7neon_mmm_f32_8x6_generic 9 4 17 0.0000044060023407811705 +armv7neon_mmm_f32_8x4_generic 17 4 7 0.000004004011518049962 +armv7neon_mmm_f32_8x4_generic 16 4 9 0.0000037009767742197034 +armv7neon_mmm_f32_8x4_generic 16 4 11 0.0000037573621028760724 +armv7neon_mmm_f32_8x6_cortexa9 23 4 18 0.000006335024379033378 +generic_f32_4x4 7 32 11 0.00000797109252276388 +generic_f32_4x4 8 32 8 0.00000506136423984456 +armv7neon_mmm_f32_8x6_cortexa9 16 32 18 0.000011962958224342696 +armv7neon_mmm_f32_8x4_generic 9 32 3 0.0000035695396503977124 +armv7neon_mmm_f32_8x4_cortexa7 15 128 9 0.000029527917733416504 +armv7neon_mmm_f32_8x4_cortexa7 16 32 3 0.0000037787003081727403 +armv7neon_mmm_f32_8x4_cortexa9 17 4 5 0.000003983936158752065 +generic_f32_4x4 5 4 7 
0.0000028427432000736736 +armv7neon_mmm_f32_8x6_generic 24 128 6 0.000019285772023622517 +armv7neon_mmm_f32_8x6_cortexa9 24 128 6 0.000020079389293074595 +armv7neon_mmm_f32_8x4_cortexa7 24 128 11 0.000043650740679119 +armv7neon_mmm_f32_8x6_cortexa7 15 4 6 0.0000019214162046321274 +armv7neon_mmm_f32_8x6_generic 24 128 11 0.000039928070800596723 +armv7neon_mmm_f32_8x6_cortexa7 17 128 13 0.00005915137603637775 +armv7neon_mmm_f32_8x6_cortexa7 9 128 12 0.00002647047584440748 +generic_f32_4x4 8 32 7 0.000005396919284647475 +armv7neon_mmm_f32_8x4_cortexa7 16 32 7 0.000006638663857963081 +armv7neon_mmm_f32_8x4_cortexa7 9 128 9 0.000029246794882198228 +armv7neon_mmm_f32_8x4_generic 15 32 12 0.000009467451463683956 +armv7neon_mmm_f32_8x6_generic 23 128 6 0.00001954577461053118 +armv7neon_mmm_f32_8x4_cortexa9 7 32 7 0.000003851842872992441 +armv7neon_mmm_f32_8x6_cortexa9 16 32 6 0.000004321654529645647 +generic_f32_4x4 13 32 4 0.00000517924119222049 +armv7neon_mmm_f32_8x4_generic 8 4 9 0.000002104797317503441 +armv7neon_mmm_f32_8x4_cortexa7 24 128 3 0.000015233751299574878 +armv7neon_mmm_f32_8x6_cortexa9 16 128 18 0.000039798911987149584 +armv7neon_mmm_f32_8x6_cortexa7 23 128 13 0.00005946225660703637 +generic_f32_4x4 7 32 5 0.000005462565009156774 +armv7neon_mmm_f32_8x6_generic 8 128 19 0.000025872976344928823 +generic_f32_4x4 5 4 12 0.0000037781682890084256 +armv7neon_mmm_f32_8x4_cortexa9 7 4 13 0.000003202055925058074 +armv7neon_mmm_f32_8x6_cortexa9 8 32 13 0.000006422608725593242 +armv7neon_mmm_f32_8x6_generic 9 128 7 0.000025813127546183672 +armv7neon_mmm_f32_8x4_generic 23 128 3 0.000014644120134860135 +armv7neon_mmm_f32_8x4_generic 8 4 3 0.0000011807711987459403 +armv7neon_mmm_f32_8x6_cortexa7 16 128 18 0.00003920283874886212 +armv7neon_mmm_f32_8x4_cortexa9 15 32 3 0.0000038264927174656245 +armv7neon_mmm_f32_8x6_cortexa7 25 32 12 0.000015882492018463845 +armv7neon_mmm_f32_8x6_generic 15 128 13 0.00003967007811496026 +armv7neon_mmm_f32_8x6_generic 25 4 12 
0.00000523795399145237 +armv7neon_mmm_f32_8x4_cortexa9 16 4 7 0.0000028765969548151297 +armv7neon_mmm_f32_8x4_cortexa7 7 4 5 0.0000018096325702613645 +armv7neon_mmm_f32_8x6_generic 25 4 19 0.00001025642311454773 +armv7neon_mmm_f32_8x6_generic 9 128 6 0.000013045121912181557 +armv7neon_mmm_f32_8x6_cortexa7 15 32 11 0.000008770690105208661 +armv7neon_mmm_f32_8x6_generic 23 128 17 0.00005984628206960245 +armv7neon_mmm_f32_8x4_cortexa9 8 32 9 0.000005054839401749921 +generic_f32_4x4 9 32 13 0.000014812319931001716 +armv7neon_mmm_f32_8x4_cortexa9 8 32 7 0.0000036300075704923473 +generic_f32_4x4 12 128 12 0.00003080051055479753 +armv7neon_mmm_f32_8x4_generic 9 4 4 0.0000015775072922596973 +generic_f32_4x4 11 128 11 0.00003163786674887252 +generic_f32_4x4 11 128 7 0.000021315035021180117 +armv7neon_mmm_f32_8x4_generic 24 128 11 0.00004302931660486898 +armv7neon_mmm_f32_8x6_cortexa7 7 128 7 0.000013660184612749007 +armv7neon_mmm_f32_8x6_generic 15 4 12 0.0000032030364372184837 +armv7neon_mmm_f32_8x6_cortexa7 15 128 17 0.00004014988958130008 +armv7neon_mmm_f32_8x4_cortexa7 17 32 7 0.000009763008233938806 +armv7neon_mmm_f32_8x4_generic 23 4 11 0.000005850624187247901 +armv7neon_mmm_f32_8x6_cortexa9 25 4 7 0.000005691323284198507 +armv7neon_mmm_f32_8x6_generic 9 4 5 0.0000018995170310061328 +generic_f32_4x4 11 32 5 0.000007873582158405535 +armv7neon_mmm_f32_8x6_generic 17 4 7 0.0000043144070272553465 +armv7neon_mmm_f32_8x6_cortexa9 7 32 13 0.000006785328240517207 +armv7neon_mmm_f32_8x4_generic 25 128 12 0.00005645345647499795 +armv7neon_mmm_f32_8x4_generic 9 4 8 0.0000026364183937099707 +armv7neon_mmm_f32_8x4_generic 9 4 3 0.000001754783422972911 +armv7neon_mmm_f32_8x6_cortexa9 24 4 5 0.000002822627399800846 +armv7neon_mmm_f32_8x4_cortexa9 23 4 7 0.000004277460484355386 +armv7neon_mmm_f32_8x4_cortexa9 16 4 13 0.000004703052607837251 +armv7neon_mmm_f32_8x4_cortexa7 9 32 8 0.0000064472411584695615 +generic_f32_4x4 5 128 3 0.000007484402044593335 +armv7neon_mmm_f32_8x6_cortexa7 
24 4 12 0.000003960884362652896 +armv7neon_mmm_f32_8x4_cortexa7 7 32 8 0.0000038146920144727004 +armv7neon_mmm_f32_8x4_cortexa9 24 4 12 0.000004810650577018378 +armv7neon_mmm_f32_8x4_cortexa9 9 4 13 0.000005031022350762627 +armv7neon_mmm_f32_8x6_cortexa9 9 128 12 0.000026841942459090627 +generic_f32_4x4 3 4 3 0.0000011084107564139803 +armv7neon_mmm_f32_8x4_generic 23 128 13 0.000057838678203389286 +armv7neon_mmm_f32_8x6_cortexa7 23 128 5 0.000020389937552256748 +armv7neon_mmm_f32_8x4_cortexa7 15 32 13 0.000012977109930123348 +armv7neon_mmm_f32_8x4_generic 16 32 11 0.00000919341672636989 +armv7neon_mmm_f32_8x6_cortexa9 17 128 17 0.000060215116898669664 +armv7neon_mmm_f32_8x4_generic 24 4 8 0.000003324206058446326 +armv7neon_mmm_f32_8x4_cortexa7 17 4 5 0.000003957357480955988 +armv7neon_mmm_f32_8x6_cortexa9 16 32 19 0.000016111976497239984 +generic_f32_4x4 13 4 12 0.000006710960240784156 +armv7neon_mmm_f32_8x6_cortexa9 23 128 5 0.000020639189093056077 +armv7neon_mmm_f32_8x6_generic 24 32 11 0.000012182073684360654 +armv7neon_mmm_f32_8x6_cortexa9 9 128 13 0.00004021781819635084 +armv7neon_mmm_f32_8x4_cortexa9 17 4 12 0.000005225946660532542 +armv7neon_mmm_f32_8x6_cortexa9 15 4 7 0.0000032840412749160575 +armv7neon_mmm_f32_8x4_cortexa9 17 128 13 0.000059220265497108164 +armv7neon_mmm_f32_8x6_cortexa9 7 4 11 0.0000020564874758047424 +generic_f32_4x4 8 128 8 0.000013969303780467269 +armv7neon_mmm_f32_8x4_cortexa7 25 4 12 0.000006595762756379242 +armv7neon_mmm_f32_8x6_generic 16 32 19 0.000015466876363304513 +armv7neon_mmm_f32_8x4_generic 24 4 11 0.0000053731939607204945 +generic_f32_4x4 8 128 13 0.00002774022280530327 +generic_f32_4x4 7 128 8 0.00001428610505820359 +armv7neon_mmm_f32_8x4_cortexa9 15 128 12 0.00002999051242337208 +armv7neon_mmm_f32_8x4_cortexa9 17 32 5 0.00000980239512919155 +armv7neon_mmm_f32_8x4_cortexa9 9 128 8 0.00001992992679699177 +armv7neon_mmm_f32_8x4_generic 25 32 5 0.00001226452231341447 +generic_f32_4x4 11 128 4 0.0000107871041093641 
+generic_f32_4x4 13 32 5 0.000010185534520279402 +armv7neon_mmm_f32_8x6_cortexa7 9 32 13 0.000012306258740029597 +generic_f32_4x4 3 4 9 0.0000023591282065930476 +armv7neon_mmm_f32_8x4_cortexa7 15 128 12 0.000029408783696922103 +armv7neon_mmm_f32_8x6_cortexa9 25 4 5 0.000003460717936029046 +armv7neon_mmm_f32_8x6_cortexa9 15 32 7 0.000008659647810573247 +armv7neon_mmm_f32_8x6_generic 15 32 12 0.000008324450366911187 +armv7neon_mmm_f32_8x4_generic 7 4 4 0.0000011733671526145581 +armv7neon_mmm_f32_8x6_generic 8 32 17 0.000006245151171661497 +generic_f32_4x4 4 128 12 0.000010589994729591713 +generic_f32_4x4 13 32 3 0.000005623254183753802 +armv7neon_mmm_f32_8x4_cortexa9 17 32 8 0.000009489997139764281 +armv7neon_mmm_f32_8x6_cortexa7 15 128 12 0.0000267366663985427 +armv7neon_mmm_f32_8x6_generic 8 32 12 0.000004170620267459084 +generic_f32_4x4 12 128 13 0.00004142579938644744 +armv7neon_mmm_f32_8x6_cortexa7 7 4 17 0.0000028372537187841064 +armv7neon_mmm_f32_8x6_cortexa7 17 4 18 0.000005987713490844775 +armv7neon_mmm_f32_8x6_cortexa9 7 4 5 0.0000012358111649104604 +armv7neon_mmm_f32_8x4_generic 7 4 7 0.0000018542946008174326 +armv7neon_mmm_f32_8x4_generic 7 4 13 0.0000031498896105686864 +armv7neon_mmm_f32_8x6_cortexa9 15 4 11 0.0000034645213247919744 +armv7neon_mmm_f32_8x6_cortexa7 23 4 5 0.0000028436717084834024 +armv7neon_mmm_f32_8x4_generic 17 128 3 0.00001456481059501176 +armv7neon_mmm_f32_8x4_cortexa7 17 4 7 0.00000404574535805843 +armv7neon_mmm_f32_8x6_cortexa9 23 4 19 0.000008441880755451333 +armv7neon_mmm_f32_8x6_generic 16 4 5 0.000002037583190427157 +armv7neon_mmm_f32_8x4_cortexa7 17 4 11 0.000005618041346469936 +generic_f32_4x4 8 128 3 0.000007542706448510692 +armv7neon_mmm_f32_8x6_cortexa7 16 4 6 0.0000016599211327997808 +armv7neon_mmm_f32_8x4_generic 24 32 11 0.000013531694734026834 +armv7neon_mmm_f32_8x4_cortexa7 24 128 13 0.00005774273822898698 +generic_f32_4x4 12 32 8 0.000007328921605870887 +armv7neon_mmm_f32_8x4_generic 17 32 5 0.000009344539695515143 
+armv7neon_mmm_f32_8x6_cortexa7 25 4 5 0.0000035354392629416224 +armv7neon_mmm_f32_8x4_generic 9 128 9 0.000028070243170981636 +armv7neon_mmm_f32_8x4_generic 7 32 7 0.0000036902878668462197 +generic_f32_4x4 4 32 5 0.000002913735845387932 +armv7neon_mmm_f32_8x4_cortexa7 23 128 9 0.00004401834026329482 +armv7neon_mmm_f32_8x6_cortexa7 15 32 17 0.000012793909984998035 +armv7neon_mmm_f32_8x4_cortexa7 24 32 11 0.000013985445748841752 +armv7neon_mmm_f32_8x4_generic 17 128 12 0.0000426589070381794 +armv7neon_mmm_f32_8x6_generic 9 128 19 0.000053237918402024005 +armv7neon_mmm_f32_8x6_cortexa7 17 4 17 0.000006443148021217296 +armv7neon_mmm_f32_8x4_cortexa7 8 128 12 0.000014559920878726009 +armv7neon_mmm_f32_8x6_cortexa9 8 4 19 0.000002972990505690349 +armv7neon_mmm_f32_8x6_cortexa7 25 128 5 0.000027009599886340098 +armv7neon_mmm_f32_8x6_cortexa7 7 32 5 0.0000025526809022378453 +generic_f32_4x4 13 128 13 0.00005568030788678991 +armv7neon_mmm_f32_8x4_cortexa7 15 32 12 0.000009774179275886764 +armv7neon_mmm_f32_8x4_generic 16 128 8 0.000018479196791574824 +armv7neon_mmm_f32_8x6_generic 25 128 17 0.00007786728690688477 +armv7neon_mmm_f32_8x6_generic 7 32 11 0.000004604319680609591 +armv7neon_mmm_f32_8x4_cortexa9 15 128 5 0.000020252737794258734 +armv7neon_mmm_f32_8x4_cortexa9 25 4 12 0.000006639922481547946 +armv7neon_mmm_f32_8x4_cortexa7 23 128 8 0.000029341008489406463 +armv7neon_mmm_f32_8x4_cortexa7 8 32 5 0.000003533774210652598 +generic_f32_4x4 8 128 7 0.000014303959630243704 +armv7neon_mmm_f32_8x4_cortexa9 25 32 5 0.000012867678837939383 +armv7neon_mmm_f32_8x6_generic 8 128 13 0.000019410057867203144 +armv7neon_mmm_f32_8x4_generic 15 4 5 0.0000029437961456768246 +generic_f32_4x4 12 4 9 0.000005303987106806706 +armv7neon_mmm_f32_8x6_generic 24 4 19 0.000007657205135344466 +armv7neon_mmm_f32_8x4_generic 9 4 11 0.000003929237868686891 +generic_f32_4x4 13 128 9 0.00004175491457338789 +armv7neon_mmm_f32_8x6_cortexa7 9 4 11 0.0000032396587513198363 
+armv7neon_mmm_f32_8x6_cortexa7 9 32 5 0.000004609642726421324 +armv7neon_mmm_f32_8x6_cortexa7 8 128 13 0.000019980200207777093 +generic_f32_4x4 8 4 13 0.000004697441883300323 +armv7neon_mmm_f32_8x4_generic 24 128 5 0.000028404231306910303 +armv7neon_mmm_f32_8x4_cortexa7 15 128 8 0.00001974335359333113 +armv7neon_mmm_f32_8x4_generic 8 4 4 0.000000975840577396329 +armv7neon_mmm_f32_8x6_cortexa9 17 4 6 0.000002372991977511254 +armv7neon_mmm_f32_8x6_cortexa7 24 4 17 0.0000063580907262403335 +armv7neon_mmm_f32_8x6_cortexa9 24 4 17 0.00000634338841510274 +armv7neon_mmm_f32_8x6_cortexa7 8 128 19 0.000026457168307267402 +armv7neon_mmm_f32_8x4_cortexa7 15 32 4 0.0000035973058136569266 +armv7neon_mmm_f32_8x6_cortexa7 23 128 11 0.000040137380065831114 +armv7neon_mmm_f32_8x6_generic 8 32 11 0.0000044080486884835774 +armv7neon_mmm_f32_8x6_cortexa7 16 32 13 0.000012184909260827915 +armv7neon_mmm_f32_8x4_generic 24 128 12 0.00004230513222395041 +armv7neon_mmm_f32_8x6_generic 16 32 6 0.000004173665390794717 +armv7neon_mmm_f32_8x4_generic 15 128 5 0.000019049336126167508 +generic_f32_4x4 5 32 11 0.000007837409156830309 +armv7neon_mmm_f32_8x6_cortexa7 15 128 5 0.00001375002565612756 +armv7neon_mmm_f32_8x6_generic 17 128 17 0.00005946391401813929 +generic_f32_4x4 8 128 9 0.00002095153754448952 +generic_f32_4x4 12 4 7 0.000003927725122844973 +armv7neon_mmm_f32_8x6_cortexa9 25 32 6 0.000008303393078126922 +armv7neon_mmm_f32_8x4_generic 17 128 11 0.000043129532556527814 +armv7neon_mmm_f32_8x6_generic 23 128 19 0.00007849962174893098 +armv7neon_mmm_f32_8x4_cortexa9 25 32 11 0.00001896209821400977 +generic_f32_4x4 13 128 4 0.000014088650845963764 +armv7neon_mmm_f32_8x6_generic 17 32 17 0.000017768659574740396 +armv7neon_mmm_f32_8x6_generic 25 32 11 0.000016016335806526446 +generic_f32_4x4 4 128 9 0.000010729958874422145 +armv7neon_mmm_f32_8x6_generic 9 128 12 0.00002573025108782965 +armv7neon_mmm_f32_8x6_cortexa7 9 128 19 0.000052645037711891825 +armv7neon_mmm_f32_8x4_cortexa7 15 32 9 
0.000009881132385003031 +armv7neon_mmm_f32_8x6_generic 15 4 11 0.0000033903277250974355 +armv7neon_mmm_f32_8x4_cortexa9 23 4 9 0.000005836030607874567 +armv7neon_mmm_f32_8x6_cortexa9 15 32 13 0.00001271636871027449 +armv7neon_mmm_f32_8x4_cortexa9 7 128 12 0.00001554481747780415 +armv7neon_mmm_f32_8x6_cortexa7 9 128 11 0.00002665664619685107 +armv7neon_mmm_f32_8x6_generic 23 4 19 0.000008276510705477435 +armv7neon_mmm_f32_8x6_cortexa7 9 128 13 0.00003963680038872068 +armv7neon_mmm_f32_8x6_generic 9 4 12 0.0000029664494321281043 +armv7neon_mmm_f32_8x4_cortexa7 9 4 8 0.0000026618049788927614 +armv7neon_mmm_f32_8x6_generic 25 4 5 0.0000034227395786859638 +armv7neon_mmm_f32_8x6_cortexa7 7 4 11 0.00000205419298901934 +armv7neon_mmm_f32_8x4_generic 15 128 4 0.000009727827829578247 +armv7neon_mmm_f32_8x4_generic 23 32 5 0.000009507106099903071 +generic_f32_4x4 13 128 3 0.000014531958964466404 +generic_f32_4x4 8 4 4 0.0000014932684997984591 +armv7neon_mmm_f32_8x6_cortexa7 9 32 12 0.0000083110843565406 +armv7neon_mmm_f32_8x4_cortexa9 9 128 4 0.000010214368441429339 +armv7neon_mmm_f32_8x6_cortexa7 24 128 11 0.000039934073240007294 +armv7neon_mmm_f32_8x4_cortexa9 25 128 9 0.0000592519545970783 +armv7neon_mmm_f32_8x4_cortexa7 17 32 13 0.000018457944384762052 +armv7neon_mmm_f32_8x6_generic 15 32 17 0.000012429374164019118 +armv7neon_mmm_f32_8x6_generic 7 4 5 0.0000012252588749082722 +generic_f32_4x4 4 4 11 0.000002142420120818127 +armv7neon_mmm_f32_8x4_cortexa7 17 32 8 0.000009331391202514006 +armv7neon_mmm_f32_8x6_cortexa9 9 32 12 0.000008401621869943723 +armv7neon_mmm_f32_8x6_cortexa7 16 32 17 0.00001235175359963665 +armv7neon_mmm_f32_8x4_cortexa9 16 4 11 0.000003828421619878369 +armv7neon_mmm_f32_8x4_cortexa9 9 4 8 0.000002691222544669364 +armv7neon_mmm_f32_8x4_cortexa9 15 32 4 0.0000036579305511926493 +armv7neon_mmm_f32_8x6_cortexa7 8 32 11 0.0000045660588709311665 +generic_f32_4x4 5 4 11 0.000003955847289217088 +generic_f32_4x4 7 128 5 0.00001437315927626327 
+generic_f32_4x4 7 4 7 0.0000028946154117180987 +generic_f32_4x4 8 32 12 0.000007313658366544497 +armv7neon_mmm_f32_8x4_cortexa7 16 4 12 0.000003356919407698039 +armv7neon_mmm_f32_8x4_cortexa9 17 32 12 0.000013931859029317578 +armv7neon_mmm_f32_8x6_generic 7 128 13 0.000019807746403591995 +armv7neon_mmm_f32_8x6_generic 16 32 12 0.000007859148665693045 +armv7neon_mmm_f32_8x6_cortexa9 7 32 17 0.000006883792927542178 +armv7neon_mmm_f32_8x4_generic 24 32 4 0.000004632821669832301 +armv7neon_mmm_f32_8x6_cortexa7 15 32 18 0.000012578243798367562 +armv7neon_mmm_f32_8x6_cortexa9 17 128 7 0.000040256201289956 +armv7neon_mmm_f32_8x6_cortexa9 25 128 13 0.00007968673747995729 +armv7neon_mmm_f32_8x6_cortexa7 16 32 12 0.000008069060391993281 +armv7neon_mmm_f32_8x6_cortexa9 8 128 13 0.000020267415746870704 +armv7neon_mmm_f32_8x4_cortexa7 23 4 12 0.0000054788039005168164 +armv7neon_mmm_f32_8x4_cortexa7 7 128 9 0.000015183203798286496 +armv7neon_mmm_f32_8x4_cortexa9 25 128 11 0.00005936651957856526 +armv7neon_mmm_f32_8x6_cortexa9 7 128 13 0.00002061155210279855 +armv7neon_mmm_f32_8x6_cortexa9 16 128 12 0.000026606165960514827 +armv7neon_mmm_f32_8x4_cortexa7 24 4 7 0.000004016126966910536 +armv7neon_mmm_f32_8x4_cortexa9 16 32 4 0.000003429114676947513 +armv7neon_mmm_f32_8x4_cortexa7 9 128 7 0.000019732799833911844 +armv7neon_mmm_f32_8x6_generic 17 128 5 0.00001976574761517156 +armv7neon_mmm_f32_8x6_cortexa7 16 128 12 0.00002622611042000424 +armv7neon_mmm_f32_8x4_generic 23 128 4 0.000014300391832012492 +generic_f32_4x4 4 32 9 0.0000040473563381120025 +armv7neon_mmm_f32_8x6_cortexa9 16 4 12 0.00000281228428913529 +armv7neon_mmm_f32_8x6_generic 15 128 7 0.000025996819067612833 +armv7neon_mmm_f32_8x4_generic 25 128 13 0.00007519120583347806 +armv7neon_mmm_f32_8x6_cortexa7 25 128 18 0.00007816972627698026 +armv7neon_mmm_f32_8x6_generic 23 4 7 0.000004478475265472157 +armv7neon_mmm_f32_8x6_cortexa7 23 32 17 0.000018694963726284716 +armv7neon_mmm_f32_8x4_cortexa9 24 128 3 
0.000015528002576413032 +armv7neon_mmm_f32_8x4_generic 15 4 13 0.000005299524671067579 +armv7neon_mmm_f32_8x6_cortexa9 8 32 7 0.000004505823387651013 +armv7neon_mmm_f32_8x4_cortexa9 9 4 7 0.0000028943361192655857 +armv7neon_mmm_f32_8x4_generic 9 32 4 0.0000033743539224745525 +armv7neon_mmm_f32_8x4_cortexa9 23 128 5 0.000030175843280997272 +armv7neon_mmm_f32_8x6_generic 8 4 6 0.000001069506404381519 +armv7neon_mmm_f32_8x6_cortexa9 9 32 17 0.000012530842578999487 +armv7neon_mmm_f32_8x6_generic 25 4 7 0.0000055691779651847645 +armv7neon_mmm_f32_8x6_cortexa7 24 32 18 0.000017489152288872757 +armv7neon_mmm_f32_8x4_generic 8 32 3 0.0000020846296384167996 +armv7neon_mmm_f32_8x4_generic 24 128 9 0.000042880640962818326 +armv7neon_mmm_f32_8x6_cortexa7 16 32 6 0.000004280562843189148 +generic_f32_4x4 11 4 7 0.000004035828927778453 +armv7neon_mmm_f32_8x6_cortexa7 17 128 6 0.000019945123788870955 +armv7neon_mmm_f32_8x6_cortexa7 23 4 13 0.000006544590047318178 +generic_f32_4x4 3 128 13 0.00001453817180213847 +armv7neon_mmm_f32_8x6_cortexa9 7 4 7 0.000001955919692684627 +armv7neon_mmm_f32_8x4_generic 8 4 8 0.0000014466616080596564 +armv7neon_mmm_f32_8x4_cortexa9 7 4 12 0.00000257120273391917 +armv7neon_mmm_f32_8x6_cortexa7 23 32 12 0.000012365351347550488 +armv7neon_mmm_f32_8x4_cortexa7 24 128 8 0.000028841373491782905 +generic_f32_4x4 12 4 4 0.0000019842828386500398 +armv7neon_mmm_f32_8x4_cortexa7 25 32 11 0.000018655746399310692 +armv7neon_mmm_f32_8x4_cortexa7 15 4 5 0.0000029733858994116004 +armv7neon_mmm_f32_8x4_cortexa9 8 128 3 0.000005515258544137823 +armv7neon_mmm_f32_8x6_cortexa7 24 4 7 0.000004423529851941163 +generic_f32_4x4 8 32 4 0.0000027872687254075905 +armv7neon_mmm_f32_8x6_generic 17 32 18 0.000017325844518327246 +armv7neon_mmm_f32_8x4_cortexa7 25 4 13 0.000009174920440845347 +armv7neon_mmm_f32_8x4_generic 25 128 8 0.00003833303029711307 +armv7neon_mmm_f32_8x4_generic 16 4 4 0.0000014454956826662534 +armv7neon_mmm_f32_8x4_cortexa7 17 4 4 0.0000020761952577578897 
+armv7neon_mmm_f32_8x6_cortexa7 8 32 5 0.0000026618366158480003 +generic_f32_4x4 3 4 4 0.0000011319274511202646 +armv7neon_mmm_f32_8x6_cortexa7 8 32 17 0.000006455239935042666 +armv7neon_mmm_f32_8x4_cortexa7 15 4 3 0.0000018508727527174426 +armv7neon_mmm_f32_8x4_cortexa9 7 128 5 0.000010464038311697658 +armv7neon_mmm_f32_8x4_cortexa7 9 128 11 0.000029303633762621288 +armv7neon_mmm_f32_8x6_generic 15 128 6 0.000013191467223518409 +armv7neon_mmm_f32_8x6_generic 8 32 19 0.00000799220605890635 +armv7neon_mmm_f32_8x4_cortexa7 25 32 3 0.000006944685928632884 +generic_f32_4x4 5 128 7 0.000014344402542266391 +armv7neon_mmm_f32_8x6_generic 25 32 18 0.00002286640455940777 +armv7neon_mmm_f32_8x6_generic 23 4 17 0.000006658878255694999 +armv7neon_mmm_f32_8x4_cortexa9 15 32 11 0.000010153701137788554 +armv7neon_mmm_f32_8x4_cortexa7 8 32 13 0.000006386976234107792 +generic_f32_4x4 9 128 11 0.0000315155364444221 +armv7neon_mmm_f32_8x4_generic 16 128 5 0.00001883838484826069 +armv7neon_mmm_f32_8x6_generic 8 4 18 0.0000021936894870634253 +armv7neon_mmm_f32_8x6_cortexa7 24 32 17 0.00001822762304619792 +armv7neon_mmm_f32_8x6_generic 24 32 6 0.000006012607283304525 +armv7neon_mmm_f32_8x4_cortexa9 7 4 8 0.0000018992159881966028 +generic_f32_4x4 5 32 8 0.000005268936385502091 +armv7neon_mmm_f32_8x6_generic 23 32 13 0.000017886686307111186 +armv7neon_mmm_f32_8x6_generic 23 128 11 0.00004012008266709576 +armv7neon_mmm_f32_8x4_generic 7 32 3 0.000002052664755929161 +armv7neon_mmm_f32_8x4_generic 7 32 13 0.0000068232658761941584 +armv7neon_mmm_f32_8x4_cortexa9 15 4 3 0.0000018728759441468222 +armv7neon_mmm_f32_8x4_cortexa9 17 32 7 0.000009921472462970692 +armv7neon_mmm_f32_8x4_cortexa7 23 4 5 0.000004108135771777712 +armv7neon_mmm_f32_8x6_generic 17 4 18 0.0000058654454576938545 +armv7neon_mmm_f32_8x6_generic 8 32 6 0.0000023314322550231567 +armv7neon_mmm_f32_8x6_cortexa9 24 32 12 0.000011982465286422162 +armv7neon_mmm_f32_8x6_cortexa9 9 4 12 0.000003049708218088651 
+armv7neon_mmm_f32_8x6_cortexa7 9 4 19 0.000005629342662833259 +generic_f32_4x4 3 32 12 0.00000437213523735793 +armv7neon_mmm_f32_8x6_generic 23 128 18 0.00005941769593137103 +armv7neon_mmm_f32_8x6_cortexa7 15 4 18 0.000004622352036845354 +armv7neon_mmm_f32_8x6_cortexa9 24 128 19 0.00007950053817997623 +armv7neon_mmm_f32_8x4_generic 16 128 4 0.000009494054447373921 +armv7neon_mmm_f32_8x6_cortexa7 7 128 17 0.000020414604977074606 +armv7neon_mmm_f32_8x4_cortexa9 7 32 12 0.000005535747117413851 +armv7neon_mmm_f32_8x6_cortexa9 17 4 5 0.000002695145363194299 +generic_f32_4x4 4 128 4 0.000003870015760471616 +armv7neon_mmm_f32_8x4_generic 15 32 11 0.000009681542653487903 +armv7neon_mmm_f32_8x6_cortexa7 7 4 13 0.000002737908308165428 +armv7neon_mmm_f32_8x4_cortexa7 9 32 7 0.000006661491356810654 +armv7neon_mmm_f32_8x4_generic 16 4 8 0.0000023864199459892154 +armv7neon_mmm_f32_8x6_generic 24 32 12 0.000011535291893158489 +armv7neon_mmm_f32_8x6_cortexa9 15 128 11 0.00002729544131868298 +armv7neon_mmm_f32_8x4_cortexa7 15 128 13 0.00003933216865420138 +generic_f32_4x4 11 4 4 0.0000021466514853413024 +armv7neon_mmm_f32_8x4_cortexa7 15 32 7 0.000006892235042247499 +armv7neon_mmm_f32_8x4_generic 24 128 7 0.000028572043647170913 +armv7neon_mmm_f32_8x4_cortexa9 9 128 12 0.000029617239836207004 +armv7neon_mmm_f32_8x6_generic 23 32 6 0.000006267673062941748 +armv7neon_mmm_f32_8x6_cortexa9 23 128 17 0.00006059797167110977 +armv7neon_mmm_f32_8x6_cortexa9 9 128 18 0.00004014798753858387 +armv7neon_mmm_f32_8x6_generic 25 32 19 0.00003066922065226773 +armv7neon_mmm_f32_8x6_generic 9 32 7 0.00000813819010478461 +armv7neon_mmm_f32_8x6_generic 8 128 18 0.000019253370491633585 +armv7neon_mmm_f32_8x6_generic 8 4 5 0.0000012747243711687536 +armv7neon_mmm_f32_8x4_cortexa7 25 4 4 0.000002551159854962235 +armv7neon_mmm_f32_8x4_cortexa9 8 128 13 0.000019869699675831398 +armv7neon_mmm_f32_8x6_cortexa9 9 128 19 0.00005342392770358122 +armv7neon_mmm_f32_8x6_cortexa9 7 4 12 0.000002051630860797947 
+armv7neon_mmm_f32_8x6_cortexa9 16 128 13 0.00004015173417099066 +armv7neon_mmm_f32_8x6_cortexa9 23 128 12 0.0000403164031404094 +armv7neon_mmm_f32_8x4_generic 23 128 5 0.000028627967286799668 +armv7neon_mmm_f32_8x4_cortexa9 25 4 8 0.000004634829587692317 +armv7neon_mmm_f32_8x4_cortexa7 15 128 4 0.000010126569600218532 +armv7neon_mmm_f32_8x4_cortexa9 23 32 4 0.000005123975664670012 +armv7neon_mmm_f32_8x6_generic 8 4 17 0.0000024242275965441413 +armv7neon_mmm_f32_8x6_cortexa7 25 128 7 0.000052727920949761586 +armv7neon_mmm_f32_8x4_cortexa7 16 32 9 0.000009437089513621594 +armv7neon_mmm_f32_8x4_generic 23 4 9 0.000005734226973840947 +armv7neon_mmm_f32_8x4_cortexa7 17 128 12 0.0000433599443382883 +armv7neon_mmm_f32_8x4_cortexa7 24 32 3 0.000005409349524302138 +armv7neon_mmm_f32_8x6_cortexa7 8 4 7 0.0000018575873013342354 +armv7neon_mmm_f32_8x6_generic 7 128 19 0.000026452799909726964 +armv7neon_mmm_f32_8x4_cortexa9 24 32 12 0.000013540981249723916 +armv7neon_mmm_f32_8x4_cortexa9 23 128 9 0.000044894692860566556 +armv7neon_mmm_f32_8x6_cortexa9 9 4 17 0.000004505536489134122 +armv7neon_mmm_f32_8x6_generic 15 32 11 0.000008507986633036132 +armv7neon_mmm_f32_8x6_cortexa7 7 32 19 0.000008805049700237493 +armv7neon_mmm_f32_8x4_generic 17 4 4 0.0000020583169447652696 +armv7neon_mmm_f32_8x4_generic 7 32 9 0.000005237513207003328 +armv7neon_mmm_f32_8x6_cortexa9 8 4 5 0.0000012863007612366299 +armv7neon_mmm_f32_8x6_cortexa7 16 128 17 0.00003972939922069338 +armv7neon_mmm_f32_8x6_cortexa9 16 32 17 0.000012476036240479964 +generic_f32_4x4 5 128 4 0.000007343673936530293 +armv7neon_mmm_f32_8x6_cortexa7 17 4 13 0.000006264666040125929 +armv7neon_mmm_f32_8x6_cortexa9 25 128 18 0.00007930397959821057 +armv7neon_mmm_f32_8x6_cortexa9 16 4 5 0.0000020584584410478216 +armv7neon_mmm_f32_8x4_cortexa9 7 32 5 0.00000378972657151533 +armv7neon_mmm_f32_8x6_cortexa9 16 4 18 0.000003945842563203367 +armv7neon_mmm_f32_8x4_generic 23 32 7 0.000009665874611953724 +armv7neon_mmm_f32_8x6_cortexa7 25 
128 6 0.000026459671946913376 +generic_f32_4x4 7 4 3 0.0000017591858474192314 +armv7neon_mmm_f32_8x6_cortexa7 16 4 18 0.000003943668999181098 +generic_f32_4x4 4 128 7 0.0000074032510257738404 +armv7neon_mmm_f32_8x4_generic 16 128 13 0.0000374668929079076 +armv7neon_mmm_f32_8x4_cortexa9 9 4 3 0.0000017875665004976476 +armv7neon_mmm_f32_8x4_cortexa9 15 128 9 0.00003010940924012135 +armv7neon_mmm_f32_8x6_cortexa9 17 32 13 0.000018295003723366057 +armv7neon_mmm_f32_8x6_cortexa9 9 4 13 0.000004395459186352696 +armv7neon_mmm_f32_8x6_cortexa7 8 4 12 0.0000016616372440035075 +generic_f32_4x4 5 128 12 0.000021001699183635672 +armv7neon_mmm_f32_8x6_generic 25 4 6 0.000002871872138224116 +armv7neon_mmm_f32_8x6_generic 7 128 5 0.000006875620566369796 +generic_f32_4x4 5 4 4 0.000001600585638499612 +armv7neon_mmm_f32_8x4_cortexa9 23 4 4 0.0000022070594896923405 +armv7neon_mmm_f32_8x6_cortexa9 8 32 17 0.0000065161951430999815 +armv7neon_mmm_f32_8x6_cortexa7 17 4 19 0.000008057471663557758 +armv7neon_mmm_f32_8x6_cortexa9 8 128 7 0.000013701107804035602 +armv7neon_mmm_f32_8x4_cortexa9 23 128 11 0.00004502007320686303 +armv7neon_mmm_f32_8x4_cortexa7 8 32 11 0.000004991462488279611 +armv7neon_mmm_f32_8x6_cortexa9 24 32 18 0.00001769207861513086 +armv7neon_mmm_f32_8x4_generic 25 128 4 0.000019508351532460557 +armv7neon_mmm_f32_8x6_cortexa7 23 4 12 0.0000044400455773711846 +armv7neon_mmm_f32_8x6_cortexa9 24 32 5 0.000006843511403564935 +armv7neon_mmm_f32_8x6_cortexa9 25 128 11 0.000053747765759359946 +armv7neon_mmm_f32_8x4_cortexa7 25 4 11 0.000007253341541711691 +armv7neon_mmm_f32_8x6_cortexa9 23 32 19 0.000024549141146277552 +armv7neon_mmm_f32_8x6_cortexa7 16 128 11 0.000026732044849478285 +armv7neon_mmm_f32_8x6_cortexa9 23 128 19 0.0000801357471530611 +generic_f32_4x4 3 32 4 0.0000017966908912230104 +armv7neon_mmm_f32_8x6_cortexa7 24 128 17 0.00005929496892410179 +armv7neon_mmm_f32_8x6_cortexa9 23 32 12 0.000012490651961414628 +armv7neon_mmm_f32_8x4_cortexa7 16 128 9 
0.000029084669529414875 +armv7neon_mmm_f32_8x6_cortexa7 7 128 19 0.000026966388966864694 +armv7neon_mmm_f32_8x4_cortexa7 23 32 8 0.000009566852853696024 +armv7neon_mmm_f32_8x4_generic 17 32 13 0.00001785315018140605 +armv7neon_mmm_f32_8x4_cortexa7 23 4 4 0.00000218193596958567 +armv7neon_mmm_f32_8x4_generic 9 32 7 0.000006462010776076977 +armv7neon_mmm_f32_8x4_generic 23 32 8 0.000009259693411977238 +armv7neon_mmm_f32_8x6_generic 15 128 12 0.000025997314252108204 +generic_f32_4x4 12 32 5 0.000007719446523382617 +armv7neon_mmm_f32_8x6_cortexa9 15 128 17 0.000040755037497853366 +armv7neon_mmm_f32_8x4_generic 24 32 5 0.00000927870929356689 +armv7neon_mmm_f32_8x4_cortexa9 23 32 11 0.00001470624762457902 +armv7neon_mmm_f32_8x4_cortexa7 7 128 7 0.000010321518649139075 +armv7neon_mmm_f32_8x4_cortexa9 17 32 3 0.000005401396003956081 +armv7neon_mmm_f32_8x4_cortexa7 25 128 5 0.00003910615015651058 +armv7neon_mmm_f32_8x6_cortexa9 17 128 18 0.00005978117998553773 +armv7neon_mmm_f32_8x6_generic 7 4 11 0.000002036996116605385 +armv7neon_mmm_f32_8x6_cortexa9 15 4 6 0.0000019083928751646085 +armv7neon_mmm_f32_8x6_cortexa9 9 4 19 0.000005613783088587898 +armv7neon_mmm_f32_8x4_cortexa7 24 32 4 0.0000047895509427620865 +armv7neon_mmm_f32_8x4_cortexa9 8 32 12 0.000004854203771648976 +armv7neon_mmm_f32_8x4_generic 25 4 11 0.000007170056956827459 +armv7neon_mmm_f32_8x6_cortexa7 24 32 6 0.000006172583934492533 +armv7neon_mmm_f32_8x6_generic 23 32 18 0.000017730914760808268 +generic_f32_4x4 9 128 4 0.00001072609031817021 +armv7neon_mmm_f32_8x6_generic 16 32 7 0.000008131169126293927 +armv7neon_mmm_f32_8x6_cortexa9 8 32 11 0.000004596930720411251 +generic_f32_4x4 3 128 3 0.000003987497974732173 +armv7neon_mmm_f32_8x6_generic 16 32 18 0.000011518507884431023 +generic_f32_4x4 12 4 13 0.0000067677797185944386 +armv7neon_mmm_f32_8x6_cortexa7 17 4 12 0.00000420578831765641 +armv7neon_mmm_f32_8x6_cortexa7 25 128 11 0.00005298746347142357 +armv7neon_mmm_f32_8x6_cortexa9 17 4 17 
0.0000064380265240168815 +armv7neon_mmm_f32_8x4_cortexa9 16 32 3 0.000003850508528044626 +armv7neon_mmm_f32_8x4_cortexa7 15 4 9 0.0000041824910345443945 +armv7neon_mmm_f32_8x6_cortexa7 16 4 13 0.000004283476073931167 +armv7neon_mmm_f32_8x6_cortexa9 15 32 19 0.00001676006214460718 +armv7neon_mmm_f32_8x6_cortexa9 9 4 6 0.0000017882273242368877 +generic_f32_4x4 11 4 12 0.000005385894826689573 +armv7neon_mmm_f32_8x6_generic 25 128 11 0.00005279747695062305 +armv7neon_mmm_f32_8x4_generic 25 128 11 0.000057163651606893397 +armv7neon_mmm_f32_8x6_generic 16 32 13 0.000011809288098621582 +armv7neon_mmm_f32_8x6_cortexa7 16 128 7 0.00002656376898311115 +armv7neon_mmm_f32_8x6_generic 8 4 19 0.0000028980842866481956 +armv7neon_mmm_f32_8x6_cortexa9 25 4 17 0.000008360160748075129 +armv7neon_mmm_f32_8x6_cortexa7 25 32 17 0.00002416762541107772 +armv7neon_mmm_f32_8x4_cortexa7 9 4 11 0.000003971891623914232 +armv7neon_mmm_f32_8x6_cortexa7 8 128 18 0.000019766077162036943 +armv7neon_mmm_f32_8x4_generic 15 32 3 0.0000036533992903578352 +armv7neon_mmm_f32_8x6_cortexa7 25 32 5 0.0000087944366611451 +armv7neon_mmm_f32_8x6_cortexa7 7 128 5 0.000007076691588612229 +armv7neon_mmm_f32_8x6_generic 23 4 6 0.000002427909563551317 +armv7neon_mmm_f32_8x4_cortexa9 17 4 9 0.000005573474441160942 +armv7neon_mmm_f32_8x4_cortexa9 25 4 5 0.000005112325491179266 +generic_f32_4x4 8 32 5 0.000005323391922801219 +armv7neon_mmm_f32_8x6_cortexa7 24 128 18 0.00005856236723783224 +armv7neon_mmm_f32_8x6_generic 23 4 18 0.000006215918837505929 +armv7neon_mmm_f32_8x4_generic 16 32 4 0.000003260594187485516 +armv7neon_mmm_f32_8x4_generic 8 4 11 0.000002124764231070715 +armv7neon_mmm_f32_8x6_cortexa9 15 128 7 0.000027104995655962794 +armv7neon_mmm_f32_8x4_cortexa7 23 4 3 0.000002539994674766333 +armv7neon_mmm_f32_8x4_cortexa9 15 128 13 0.000040120613837028105 +armv7neon_mmm_f32_8x4_generic 9 32 5 0.000006416826225047117 +armv7neon_mmm_f32_8x6_generic 9 128 13 0.000039231938860404315 +armv7neon_mmm_f32_8x4_cortexa7 
17 32 9 0.000014080751383917914 +armv7neon_mmm_f32_8x4_generic 24 4 5 0.0000038475341959329615 +armv7neon_mmm_f32_8x6_cortexa7 17 32 19 0.00002385320479583871 +armv7neon_mmm_f32_8x4_cortexa9 25 4 7 0.000005243554539274239 +armv7neon_mmm_f32_8x4_cortexa9 8 128 7 0.000010306643098752432 +armv7neon_mmm_f32_8x4_cortexa7 9 32 9 0.000009612025649062918 +armv7neon_mmm_f32_8x6_generic 7 128 6 0.000006918233017670394 +armv7neon_mmm_f32_8x6_cortexa7 23 32 6 0.000006480642874727307 +armv7neon_mmm_f32_8x4_cortexa7 17 4 8 0.0000036467312614380765 +armv7neon_mmm_f32_8x6_cortexa9 9 4 18 0.000004263423188010347 +armv7neon_mmm_f32_8x4_cortexa9 23 4 3 0.000002565453674896801 +armv7neon_mmm_f32_8x6_cortexa7 17 128 17 0.00005934218053895787 +armv7neon_mmm_f32_8x6_cortexa9 9 128 11 0.00002703084778716899 +armv7neon_mmm_f32_8x6_cortexa7 8 32 19 0.000008254990878887053 +armv7neon_mmm_f32_8x4_cortexa7 24 4 11 0.000005437311620791031 +armv7neon_mmm_f32_8x6_cortexa7 17 32 12 0.000012103957972190128 +armv7neon_mmm_f32_8x6_cortexa7 16 32 11 0.000008575915963542344 +armv7neon_mmm_f32_8x6_cortexa7 15 128 13 0.00003995376024317136 +armv7neon_mmm_f32_8x4_cortexa7 25 128 12 0.00005747997493377876 +armv7neon_mmm_f32_8x6_cortexa7 9 4 17 0.000004513992669521213 +armv7neon_mmm_f32_8x4_cortexa9 7 32 11 0.000005542086596158504 +armv7neon_mmm_f32_8x4_generic 16 32 8 0.00000600791906169857 +armv7neon_mmm_f32_8x6_cortexa9 8 4 6 0.000001079992540189233 +armv7neon_mmm_f32_8x6_cortexa7 15 4 7 0.0000033103105460849674 +armv7neon_mmm_f32_8x6_generic 7 4 18 0.0000027798204465348195 +armv7neon_mmm_f32_8x4_cortexa9 9 4 4 0.0000016086729123039774 +armv7neon_mmm_f32_8x4_cortexa9 17 4 13 0.000007139907020236834 +armv7neon_mmm_f32_8x6_generic 7 4 17 0.000002811022144491351 +armv7neon_mmm_f32_8x4_generic 16 4 12 0.00000330778427406012 +armv7neon_mmm_f32_8x6_cortexa7 17 32 7 0.000012331946306204285 +armv7neon_mmm_f32_8x4_cortexa7 23 128 4 0.000014866985895709814 +armv7neon_mmm_f32_8x6_generic 8 128 5 
0.000006930294425648014 +armv7neon_mmm_f32_8x6_cortexa7 7 128 11 0.000013755100434106755 +armv7neon_mmm_f32_8x4_cortexa7 9 4 3 0.0000017694089316929863 +armv7neon_mmm_f32_8x4_cortexa9 25 128 5 0.00003989811331140034 +armv7neon_mmm_f32_8x6_cortexa9 24 4 18 0.000005659337689416026 +armv7neon_mmm_f32_8x6_cortexa7 7 32 7 0.000004615694133689407 +armv7neon_mmm_f32_8x4_generic 23 4 12 0.000005414200630328292 +armv7neon_mmm_f32_8x6_cortexa7 15 128 7 0.00002674534329947021 +armv7neon_mmm_f32_8x4_cortexa9 23 128 13 0.0000596298628444228 +armv7neon_mmm_f32_8x6_cortexa9 7 32 19 0.000008889380829716555 +armv7neon_mmm_f32_8x6_cortexa7 8 4 17 0.0000025025489216272165 +armv7neon_mmm_f32_8x6_cortexa9 24 128 5 0.000020677151329306246 +armv7neon_mmm_f32_8x6_cortexa9 17 32 17 0.000018491383989214777 +armv7neon_mmm_f32_8x4_cortexa7 8 4 11 0.000002146399138479012 +armv7neon_mmm_f32_8x6_cortexa7 7 4 19 0.0000034947036576981197 +armv7neon_mmm_f32_8x6_cortexa7 15 32 13 0.00001261074648998989 +armv7neon_mmm_f32_8x6_cortexa7 17 128 12 0.000039467248529014934 +armv7neon_mmm_f32_8x6_cortexa9 15 128 13 0.0000405486222699268 +armv7neon_mmm_f32_8x6_cortexa7 8 4 6 0.0000010838257676938484 +armv7neon_mmm_f32_8x4_cortexa7 8 128 11 0.00001478367268084039 +generic_f32_4x4 13 4 5 0.000005005970941882336 +armv7neon_mmm_f32_8x4_generic 9 4 5 0.0000027981911123646093 +armv7neon_mmm_f32_8x4_cortexa9 24 32 3 0.000005499754513818659 +armv7neon_mmm_f32_8x4_cortexa9 16 32 8 0.00000632372871965183 +armv7neon_mmm_f32_8x6_generic 25 32 7 0.000015750403152655726 +armv7neon_mmm_f32_8x6_cortexa7 9 128 6 0.000013469652928515112 +generic_f32_4x4 5 128 8 0.000014178354640152658 +armv7neon_mmm_f32_8x4_cortexa9 25 32 13 0.00002471922478461092 +armv7neon_mmm_f32_8x4_generic 15 32 7 0.000006693321105057111 +armv7neon_mmm_f32_8x4_cortexa9 23 128 8 0.000029940232861406133 +armv7neon_mmm_f32_8x4_cortexa7 7 128 5 0.000010257823038892243 +generic_f32_4x4 4 32 12 0.000003911119050209719 +armv7neon_mmm_f32_8x6_cortexa7 8 128 7 
0.000013528068455942153 +armv7neon_mmm_f32_8x4_cortexa7 24 4 5 0.000003893712023496801 +armv7neon_mmm_f32_8x4_cortexa7 9 4 5 0.00000282867688987085 +armv7neon_mmm_f32_8x6_generic 9 4 6 0.0000017337687872243595 +armv7neon_mmm_f32_8x4_generic 8 32 11 0.000004839072297522764 +armv7neon_mmm_f32_8x4_cortexa7 16 32 8 0.000006215610136029107 +armv7neon_mmm_f32_8x4_cortexa7 24 128 12 0.00004296408794693615 +armv7neon_mmm_f32_8x6_generic 23 32 7 0.000012139369977001298 +armv7neon_mmm_f32_8x6_cortexa9 23 32 7 0.00001262657347999726 +armv7neon_mmm_f32_8x6_generic 9 32 18 0.000011811540149507637 +generic_f32_4x4 4 4 12 0.0000019950687694365225 +armv7neon_mmm_f32_8x4_cortexa9 17 32 13 0.000018760347719013833 +armv7neon_mmm_f32_8x4_generic 24 128 8 0.000027893007985374452 +armv7neon_mmm_f32_8x6_generic 7 32 12 0.000004596558109391685 +generic_f32_4x4 12 32 12 0.0000107075710637508 +armv7neon_mmm_f32_8x6_generic 16 4 18 0.000003888078280974674 +generic_f32_4x4 12 128 7 0.000021189568041458556 +armv7neon_mmm_f32_8x6_generic 16 128 11 0.000025972024072475202 +armv7neon_mmm_f32_8x6_cortexa7 15 4 13 0.000004683720683139814 +armv7neon_mmm_f32_8x6_cortexa9 7 128 6 0.000007213870080063144 +armv7neon_mmm_f32_8x6_cortexa9 23 128 11 0.00004071170407195971 +armv7neon_mmm_f32_8x6_generic 24 4 5 0.000002793903924978298 +armv7neon_mmm_f32_8x4_cortexa9 15 32 9 0.000010036115634563751 +generic_f32_4x4 9 128 9 0.000031422044792444864 +armv7neon_mmm_f32_8x4_generic 25 32 4 0.000006133898665196312 +generic_f32_4x4 13 128 8 0.00002772292203595808 diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs new file mode 100644 index 000000000..6ccb6adbe --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs @@ -0,0 +1,16 @@ +use crate::frame::mmm::CostModel; + pub fn model() -> CostModel<'static> { + CostModel { + big_product_mkn_threshold: 4194036.0, + big_product_kernel_choice: "armv7neon_mmm_f32_8x6_cortexa9", + kernels: 
&["armv7neon_mmm_f32_8x4_cortexa7", "armv7neon_mmm_f32_8x4_cortexa9", "armv7neon_mmm_f32_8x4_generic", "armv7neon_mmm_f32_8x6_cortexa7", "armv7neon_mmm_f32_8x6_cortexa9", "armv7neon_mmm_f32_8x6_generic", "generic_f32_4x4"], + mrs: &[4, 8], + nrs: &[4, 6], + feat_norm_mean: &[4.582296677813486, 4.595402322442016, 4.571260231028445, 13.748959231283994, 1.5179177668804225, 0.7575757575757576, 3.5337608449641644, 0.8831887338111405, 1.5048409405255878, 0.7526719476926946, 2.489123601156796, 0.8326417704011065], + feat_norm_stddev: &[1.2635817489024164, 1.2723436827339079, 1.2620157548883217, 1.3497763942449361, 1.1141159992246472, 0.42854956435545316, 2.2880460409304937, 0.32119525880720723, 1.1154901716833412, 0.43145902105435263, 1.7051378780434328, 0.37329539587896904], + w1: &[0.5391961336135864, -0.32089367508888245, 0.203999862074852, -0.10011337697505951, 0.09040801972150803, -0.14198464155197144, 0.031854499131441116, 0.12334256619215012, 0.15339604020118713, -0.20091375708580017, -0.014548280276358128, 0.12154694646596909, 0.31225234270095825, 0.10782113671302795, 0.44618168473243713, 0.8267014026641846, -0.1204405128955841, -0.08261110633611679, -0.052502430975437164, 0.3066086769104004, 0.1493932157754898, -0.14119412004947662, -0.1985343098640442, 0.19361039996147156, -0.4636686146259308, 0.08120443671941757, 0.03210291638970375, 0.17303235828876495, 0.16502155363559723, -0.19771894812583923, -0.11060577630996704, 0.08698348701000214, -0.07793445140123367, 0.32749465107917786, 0.3663202226161957, -0.4629170894622803, -0.1586134433746338, 0.4272242486476898, -0.12016090005636215, -0.17830348014831543, -0.05493386462330818, -0.036517318338155746, 0.01293050218373537, 0.016577009111642838, 0.10738552361726761, -0.3662779629230499, -0.2917434275150299, 0.5752639770507812, 0.11406347155570984, 0.8622727394104004, 0.07158719748258591, 0.29530274868011475, -0.11287810653448105, 0.12262264639139175, 0.02478562481701374, 0.17749948799610138, -0.036227867007255554, 
0.10140471905469894, -0.011896232143044472, -0.021761735901236534, 0.06046223267912865, 0.5727048516273499, -0.007826486602425575, 0.3863913118839264, -0.04224887117743492, 0.056023009121418, -0.02467598207294941, 0.0385640449821949, 0.0219524335116148, -0.03437826409935951, -0.2060588151216507, 0.2895224988460541, 0.10751669108867645, 0.00845037866383791, -0.1836385875940323, -0.24757762253284454, -0.09606243669986725, 0.03918633610010147, 0.07913251221179962, 0.06499160826206207, -0.08156774938106537, 0.08835449814796448, 0.13896305859088898, -0.16936920583248138, 0.010146846994757652, -0.42553824186325073, 0.39916151762008667, -0.004584060981869698, -0.10256388038396835, 0.041573416441679, 0.05155385658144951, 0.015019520185887814, 0.09554271399974823, -0.20487457513809204, -0.4146610200405121, -0.773110032081604, 0.3662724494934082, -0.23762361705303192, 0.6974321603775024, 0.8990052938461304, 0.02772649936378002, 0.042197681963443756, -0.0022736566606909037, -0.028843341395258904, -0.4559306204319, 0.6326258778572083, 0.4568879008293152, -0.4892531633377075, -0.032289132475852966, 0.04378330707550049, -0.4118069112300873, 0.2493579089641571, -0.021955665200948715, -0.01538186427205801, -0.21400974690914154, -0.09971866756677628, 0.02185226045548916, -0.18125569820404053, -0.13828244805335999, -0.20846466720104218, -0.10373540222644806, 0.4842098653316498, -0.06586655229330063, 0.03369470313191414, 0.013142148964107037, 0.017437899485230446, 0.15891534090042114, 0.5269678831100464, 0.02546108327805996, -0.004250233061611652, -5.8676625485531986e-05, 0.06777831166982651, -0.14051207900047302, 0.6876491904258728, -0.3455996811389923, 0.0378129817545414, 0.15291574597358704, -0.03829087316989899, -0.05761529877781868, -0.05344394966959953, 0.1421334147453308, -0.3614322543144226, -0.21606910228729248, 0.1558765172958374, 0.14480257034301758, -0.1799984872341156, 0.4238421618938446, -0.08961529284715652, -0.04010967165231705, 0.14250615239143372, 
-0.0038367861416190863, -0.044531334191560745, -0.08958051353693008, -0.1577986180782318, -0.5795103907585144, -1.1048516035079956, 0.16444185376167297, -0.09989812225103378, -0.26304998993873596, 0.040687527507543564, 0.065303735435009, -0.06267901510000229, 0.08742637187242508, 0.02480895072221756, 0.23719966411590576, -0.09509539604187012, 0.39278310537338257, 0.18978112936019897, 0.11301649361848831, -0.16268616914749146, -0.14119602739810944, -0.04518252611160278, 0.10456270724534988, 0.008367948234081268, 0.004280170891433954, 0.01894286274909973, -0.1547478288412094, 0.197267547249794, 0.20271208882331848, -0.28377917408943176, -0.26751258969306946, 0.15954937040805817, 0.33988064527511597, 0.16848208010196686, 0.11668887734413147, -0.057433612644672394, -0.049777109175920486, 0.00744214653968811, -0.012330793775618076, -0.08413149416446686, -0.2053118497133255, 0.09235486388206482, -0.1354941576719284, 0.41610953211784363, 0.8428494334220886, 0.880882740020752, 0.024029193446040154, -0.08453702926635742, 0.00771496444940567, -0.013013732619583607, -0.23804998397827148, 0.4110376536846161, 0.23720477521419525, -0.13951541483402252, -0.1747516244649887, -0.34215790033340454, 0.014357345178723335, 0.34224632382392883, 0.03783192113041878, 0.01125166192650795, -0.08253959566354752, 0.015717405825853348, -0.22759634256362915, 0.3980898857116699, 0.2427154779434204, -0.3319437801837921, 0.11146843433380127, -0.9666317105293274, -0.12227121740579605, -0.1948898285627365, -0.030186548829078674, 0.0011711223050951958, -0.040062546730041504, -0.16316139698028564, -0.14714862406253815, 0.13224393129348755, -0.0019320327555760741, -0.09674090147018433, 0.3630145490169525, -0.019513679668307304, -0.07729464769363403, -0.34592965245246887, 0.15215164422988892, 0.046678490936756134, 0.06675180792808533, -0.08943335711956024, 0.006386714521795511, 0.10086977481842041, -0.07409387081861496, -0.19604018330574036, -0.042700666934251785, 0.12124726921319962, 
0.5694677233695984, 0.25033196806907654, 0.01862989366054535, 0.0053687929175794125, -0.0017405126709491014, -0.01638556271791458, -0.32222822308540344, 0.5348804593086243, 0.5546748042106628, 1.2770946025848389, 0.11648745834827423, -0.058405984193086624, -0.2997635006904602, -0.2040756195783615, 0.15525077283382416, -0.12436354905366898, -0.089121975004673, 0.06441225856542587, 0.2444663643836975, -0.3495825529098511, -0.05243751034140587, 0.08752834796905518, 0.08800745010375977, -0.09807545691728592, -0.3823537230491638, -0.13047000765800476, 0.029333092272281647, 0.11618250608444214, -0.0638590008020401, -0.09598273783922195, -0.07390140742063522, 0.09151650220155716, -0.1700282245874405, 0.23608872294425964, 0.24879834055900574, -0.15922772884368896, -0.33795130252838135, -0.053850702941417694, 0.1014639139175415, -0.05480973795056343, -0.06753639131784439, 0.04606246575713158, -0.07082260400056839, 0.07848796248435974, 0.05011916160583496, -0.05570689216256142, -0.14584510028362274, -0.8908579349517822, -0.5959509611129761, -0.8982105255126953, 0.0788002535700798, -0.03575791418552399, 0.052424680441617966, -0.08019822835922241, 0.10848221182823181, 0.0957408994436264, 0.1457311511039734, -0.1956494003534317, -0.21669772267341614, 0.9854136109352112, -0.23215851187705994, 0.16359730064868927, 0.02025810070335865, -0.08975380659103394, -0.013868067413568497, -0.22188447415828705, 0.020666224882006645, -0.22304703295230865, 0.06407633423805237, 0.19804184138774872, -0.05285267159342766, -0.5510660409927368, -0.8522927761077881, -0.6061599850654602, 0.08484024554491043, -0.08973539620637894, 0.013228937052190304, -0.07834818214178085, 0.02858446165919304, -0.3826225996017456, 0.059726644307374954, 0.1139102503657341, -0.19311848282814026, 0.05770142376422882, 0.22584261000156403, 0.34312352538108826, -0.15085645020008087, 0.34372228384017944, 0.08070214092731476, 0.5744000673294067, -0.08693907409906387, -0.003695777617394924, -0.1334235966205597, 
0.06418291479349136, 0.02848576195538044, -0.34958112239837646, -0.3419312834739685, -0.09599799662828445, 0.015022341161966324, 0.03255023807287216, 0.09713662415742874, -0.1730588674545288, 0.1904430240392685, -0.32815566658973694, -0.16749203205108643, 0.35736411809921265, -0.503787100315094, 0.5057004690170288, -0.47198373079299927, 0.11386436969041824, -0.0722493901848793, 0.03358639404177666, 0.005928087048232555, -0.05637047439813614, 0.06552420556545258, -0.07283362001180649, -0.09314802289009094, 0.13586974143981934, -0.5054865479469299, -0.18127793073654175, 0.08853171765804291, -0.13333705067634583, -0.2623322308063507, 0.17757390439510345, 0.04408252611756325, -0.0277855321764946, -0.05175777152180672, 0.40444689989089966, -0.03518976643681526, -0.36402902007102966, -0.019589770585298538, -0.05277400091290474, -0.27273234724998474, -0.07373850792646408, -0.058221735060214996, 0.14292845129966736, -0.005004828795790672, -0.05554938316345215, 0.20361287891864777, -0.30462127923965454, -0.1140812486410141, 0.16081976890563965, -0.07133162021636963, -0.20463652908802032, 0.34733739495277405, 0.17099761962890625, 0.025868643075227737, -0.02960631065070629, -0.02717636525630951, 0.02027258090674877, -0.13165302574634552, 0.36201152205467224, 0.5002728700637817, 0.39691421389579773, -0.04605599492788315, 0.28801581263542175, -1.0140656232833862, -0.5481916666030884, 0.0896061584353447, -0.049390073865652084, 0.08813252300024033, -0.1784677952528, 0.34480658173561096, -0.36402803659439087, 0.16948284208774567, 0.45740315318107605, -0.23747704923152924, 0.580975353717804, -0.24338461458683014, -0.11410018056631088, 0.06431885808706284, -0.0317281149327755, -0.024683356285095215, -0.10083278268575668, 0.024547407403588295, -0.16270779073238373, -0.07757837325334549, 0.19732129573822021, 0.03790999948978424, -0.18804220855236053, 0.8675169348716736, 0.5377629399299622, -0.0036910742055624723, -0.0016441351035609841, -0.030448857694864273, 0.07757671177387238, 
-0.1475408971309662, 0.613543689250946, 0.30266445875167847, 0.12106148898601532, 0.05485830456018448, -0.04748840630054474, -0.23233623802661896, -0.1949906051158905, 0.05692804977297783, 0.07474583387374878, -0.11879625171422958, 0.07200933247804642, -0.012743310071527958, -0.02546215057373047, -0.3765566349029541, 0.28637346625328064, -0.18051809072494507, 0.5034835934638977, -0.34970414638519287, -0.2386687994003296, -0.03804561868309975, -0.03649319335818291, -0.10303670912981033, 0.1299818456172943, 0.24685724079608917, -0.34168556332588196, -0.086674265563488, 0.32085898518562317, 0.48488491773605347, -0.522548258304596, 0.309568852186203, 0.167385995388031, 0.11308691650629044, 0.14733079075813293, -0.22416195273399353, 0.14763982594013214, -0.07242503017187119, 0.07601745426654816, -0.10375087708234787, -0.03409396857023239, -0.35759225487709045, 0.18936687707901, 0.28248289227485657, 0.26482364535331726, 0.061123836785554886, -0.021603189408779144, -0.13469825685024261, 0.07248867303133011, -0.03464066982269287, 0.06557167321443558, 0.16093865036964417, -0.1718607246875763], + b1: &[-0.3893989324569702, -0.2791002690792084, 0.07853052020072937, -0.4629746377468109, -0.7148261070251465, 0.8680436015129089, -0.46459102630615234, 0.0404132716357708, -0.44012945890426636, 0.08434166759252548, 0.32190972566604614, -0.20194832980632782, -0.3781348764896393, -0.23968002200126648, -0.581799328327179, 0.6500483155250549, -0.6192854046821594, 0.5922245383262634, 0.44006091356277466, 0.2982949912548065, 0.6136102676391602, -0.597486138343811, -0.3697699308395386, -0.45241132378578186, 0.60771644115448, -0.3373708128929138, 0.5697194337844849, 0.4784911870956421, -0.49601855874061584, 0.5023709535598755, 0.21592296659946442, -0.45412343740463257, 0.5104787945747375, 0.558862566947937, 0.4729066491127014, -0.5520593523979187, -0.5120576620101929, -0.7157037258148193, 0.12596718966960907, 0.4773174524307251], + w2: &[0.1379607617855072, 0.09308824688196182, 
-0.2596932649612427, 0.4461972713470459, 0.3480601906776428, 0.036684323102235794, 0.4057384729385376, -0.3081648051738739, 0.4561280608177185, 0.2749394178390503, -0.1400817334651947, 0.3145979046821594, -0.16919250786304474, 0.7247185707092285, 0.3479674756526947, -0.7546817064285278, 0.38135531544685364, -0.3939172029495239, -0.038021210581064224, 0.026914050802588463, -0.5281358361244202, 0.39009571075439453, 0.4090450406074524, 0.5053343772888184, -0.23938016593456268, 0.488080233335495, -0.38536468148231506, -0.23763014376163483, 0.2661689519882202, -0.14746293425559998, -0.7541974186897278, 0.27726081013679504, -0.4072169065475464, -0.8030230402946472, -0.386343389749527, 0.6674754619598389, 0.06677238643169403, 0.5055669546127319, -0.44330647587776184, -0.3423362970352173, -0.10948927700519562, 0.11290912330150604, -0.2759379744529724, 0.5522158741950989, -0.5766478776931763, 0.7288797497749329, -0.4967955946922302, -0.5466133952140808, 0.7254890203475952, 0.1274457424879074, 0.3098924458026886, 0.2524661719799042, -0.7162019610404968, 0.19503603875637054, -0.5212412476539612, 0.0968603864312172, 0.4835629463195801, -0.5865079164505005, 0.27647316455841064, 0.1975109577178955, -0.845225989818573, 0.4172143042087555, -0.014424118213355541, -0.24702520668506622, -0.16123531758785248, -0.047759659588336945, -0.09985388815402985, 0.10430619865655899, 0.53556889295578, 0.2595883011817932, 0.11729882657527924, 0.36996161937713623, -0.41997936367988586, -0.3332042694091797, 0.2527308464050293, 0.6039140820503235, 0.35183605551719666, 0.42042237520217896, -0.2265913337469101, -0.06852111965417862, 0.3749903440475464, 0.3698897361755371, -0.43096107244491577, 0.1275794953107834, 0.27926334738731384, -0.3282606303691864, 0.290679931640625, -0.14467079937458038, 0.3357028663158417, -0.0683436468243599, -0.35492125153541565, -0.14275093376636505, -0.1504347324371338, 0.1782987266778946, 0.07464402168989182, -0.2788643538951874, 0.5896115303039551, -0.314520001411438, 
-0.3235827684402466, -0.2899278700351715, -0.21264874935150146, 0.41862159967422485, 0.3237628936767578, 0.2948566973209381, -0.6101413369178772, -0.025511808693408966, -0.4238346517086029, -0.28283095359802246, 0.32077667117118835, -0.34138476848602295, -0.5257527232170105, 0.24129967391490936, -0.38175472617149353, -0.20559589564800262, -0.11267697811126709, 0.32475054264068604, 0.29545050859451294, 0.0010625360300764441, 0.4097916781902313, -0.3120468556880951, 0.3134985566139221, 0.33620578050613403, -0.27408266067504883, -0.0118736382573843, 0.21356475353240967, -0.6716119647026062, 0.14166241884231567, 0.020748334005475044, 0.27158322930336, -0.27066248655319214, -0.5078546404838562, 0.39642488956451416, 0.4044502079486847, 0.1363500952720642, 0.38089585304260254, -0.18438327312469482, -0.08652642369270325, 0.05718545988202095, -0.5758764743804932, 0.0948563665151596, 0.298057496547699, -0.07299521565437317, -0.24248233437538147, 0.29135069251060486, -0.44556060433387756, 0.6689074039459229, -0.12930674850940704, -0.12669484317302704, 0.1074564978480339, -0.20472179353237152, 0.14787982404232025, -0.13180267810821533, 0.3045596182346344, -0.3345180153846741, -0.3405822217464447, 0.22327540814876556, 0.02809770777821541, 0.17404714226722717, 0.22873322665691376, -0.3915692865848541, -0.39005470275878906, -0.4675980806350708, 0.44798821210861206, -0.31790846586227417, -0.21734853088855743, 0.2172199934720993, -0.3485357165336609, 0.1241735890507698, -0.6933310031890869, -0.09649480134248734, 0.24731965363025665, -0.20421941578388214, 0.13033808767795563, -0.4282769560813904, -0.22173112630844116, 0.08912057429552078, -0.3927532434463501, 0.3523387908935547, 0.36073970794677734, -0.036902282387018204, 0.5880261063575745, -0.29945725202560425, -0.40845751762390137, -0.3265145421028137, 0.370391309261322, -0.3553546965122223, 0.5133077502250671, 0.1800842434167862, -0.34683868288993835, 0.28811708092689514, 0.3033837080001831, -0.4140017628669739, 
0.4362258017063141, 0.3689269423484802, 0.3121638596057892, -0.3287503123283386, -0.15226924419403076, -0.17191028594970703, -0.10683685541152954, 0.34219542145729065, 0.34955963492393494, 0.22892920672893524, -0.20123478770256042, -0.3934169411659241, 0.25449705123901367, -0.541163444519043, 0.21640898287296295, 0.19343338906764984, -0.14020974934101105, 0.010480044409632683, -0.24229897558689117, -0.4682120084762573, 0.02336042746901512, 0.039344485849142075, 0.42446646094322205, -0.3173693120479584, 0.23609045147895813, 0.20335273444652557, -0.19347436726093292, -0.05698636546730995, 0.17990583181381226, 0.30915674567222595, 0.3115670382976532, 0.4147215485572815, -0.38558056950569153, -0.12379863113164902, 0.025996098294854164, -0.3010733425617218, 0.03275908902287483, -0.6039671897888184, 0.06267470866441727, -0.012677585706114769, 0.3484704792499542, 0.24301587045192719, -0.40881243348121643, -0.16732162237167358, 0.190901979804039, -0.5619192719459534, 0.30009278655052185, -0.43359509110450745, 0.26643550395965576, 0.5083268880844116, 0.3491555452346802, 0.4731655716896057, 0.6301924586296082, -0.8111121654510498, 0.6473397016525269, -0.001451796037144959, 0.3649038076400757, -0.6002859473228455, -0.41925248503685, 0.05584913119673729, 0.7823511362075806, 0.421135276556015, 0.5779385566711426, -0.49475061893463135, 0.5293950438499451, -0.45432502031326294, -0.680946946144104, -0.3506624102592468, -0.21028658747673035, 0.4775547385215759, 0.25049126148223877, 0.2707470655441284, -0.3469635546207428, 0.5959001779556274, -0.5623777508735657, -0.6334168910980225, 0.4096938669681549, -0.3921370208263397, -0.27649807929992676, 0.4424516260623932, -0.28308066725730896, -0.22009265422821045, -0.386872798204422, 0.5130718350410461, 0.5702601075172424, 0.7469420433044434, -0.09606175124645233, -0.4271978437900543], + b2: &[-0.07522959262132645, 0.3644154667854309, -0.25166040658950806, -0.12973527610301971, 0.25026997923851013, -0.2794199585914612, 
-0.17614373564720154], + } + } diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt new file mode 100644 index 000000000..201cd54ce --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt @@ -0,0 +1,1701 @@ +armv7neon_mmm_f32_8x6_generic 17 128 19 0.00006235573582381347 +armv7neon_mmm_f32_8x4_cortexa7 23 32 3 0.000006021597781788675 +armv7neon_mmm_f32_8x6_cortexa7 17 128 7 0.000041163109831630036 +armv7neon_mmm_f32_8x6_generic 9 4 5 0.0000020753617625129768 +generic_f32_4x4 13 4 3 0.000003220368712907131 +armv7neon_mmm_f32_8x4_cortexa7 9 128 3 0.000011219671010907719 +armv7neon_mmm_f32_8x6_generic 24 4 12 0.00000416032880372066 +armv7neon_mmm_f32_8x6_cortexa9 15 128 12 0.00002134037524275856 +armv7neon_mmm_f32_8x4_cortexa7 16 32 7 0.000007487730700761545 +armv7neon_mmm_f32_8x4_generic 25 128 4 0.000015723210051937644 +armv7neon_mmm_f32_8x6_cortexa9 15 128 13 0.000031729639665247244 +armv7neon_mmm_f32_8x4_cortexa7 17 128 5 0.00003233807616782481 +generic_f32_4x4 5 4 9 0.000004114340189363069 +armv7neon_mmm_f32_8x4_cortexa9 7 4 12 0.0000026805076801341797 +armv7neon_mmm_f32_8x6_cortexa7 8 4 7 0.0000020191503624854738 +generic_f32_4x4 5 32 4 0.0000030555103653558445 +armv7neon_mmm_f32_8x4_cortexa7 8 128 12 0.000016128094616247412 +generic_f32_4x4 13 128 8 0.000029057855790622486 +armv7neon_mmm_f32_8x6_cortexa7 23 128 5 0.00002125083853467516 +generic_f32_4x4 5 128 4 0.000007724128084704853 +generic_f32_4x4 9 128 5 0.000022305019807747277 +armv7neon_mmm_f32_8x6_generic 23 32 11 0.000011268391904458938 +armv7neon_mmm_f32_8x4_generic 7 4 12 0.0000026796237449338948 +armv7neon_mmm_f32_8x4_cortexa9 8 32 11 0.000004490236277399054 +armv7neon_mmm_f32_8x6_cortexa7 23 4 7 0.000005431543985839428 +armv7neon_mmm_f32_8x4_cortexa9 7 32 11 0.000004893302315959357 +armv7neon_mmm_f32_8x6_generic 17 4 6 0.0000024671248311817606 +armv7neon_mmm_f32_8x6_cortexa7 7 4 7 0.0000023786534435590155 
+armv7neon_mmm_f32_8x6_cortexa9 23 32 7 0.000011142179078429717 +armv7neon_mmm_f32_8x6_cortexa7 23 128 18 0.00006187753967619513 +armv7neon_mmm_f32_8x4_cortexa9 17 4 13 0.000007583246734362711 +armv7neon_mmm_f32_8x4_cortexa7 25 32 8 0.000013742616132230942 +armv7neon_mmm_f32_8x4_cortexa9 15 128 13 0.00003174823725370825 +armv7neon_mmm_f32_8x6_cortexa7 8 32 17 0.0000068525292311713665 +armv7neon_mmm_f32_8x4_generic 23 4 12 0.000005946433902061308 +armv7neon_mmm_f32_8x4_generic 7 32 12 0.000004913908866329809 +armv7neon_mmm_f32_8x4_generic 9 4 5 0.0000029635862825605913 +armv7neon_mmm_f32_8x6_generic 24 128 7 0.00003097026633557495 +armv7neon_mmm_f32_8x4_generic 24 4 13 0.000007343587112941961 +armv7neon_mmm_f32_8x4_generic 23 4 8 0.000004129384710709428 +generic_f32_4x4 11 128 8 0.00002206478635932785 +armv7neon_mmm_f32_8x4_generic 8 4 9 0.000002203244153911471 +armv7neon_mmm_f32_8x4_cortexa9 17 32 3 0.000004810722960669039 +armv7neon_mmm_f32_8x4_cortexa7 7 128 12 0.000016707563395180685 +armv7neon_mmm_f32_8x6_cortexa7 7 4 12 0.000002638802872509435 +armv7neon_mmm_f32_8x4_cortexa7 17 128 9 0.0000480527815279415 +armv7neon_mmm_f32_8x4_cortexa7 7 128 7 0.000011282871511736955 +armv7neon_mmm_f32_8x4_cortexa9 16 4 5 0.000002910382588603331 +armv7neon_mmm_f32_8x4_generic 17 128 13 0.00004671426916376633 +armv7neon_mmm_f32_8x4_cortexa7 9 128 8 0.00002150638840328972 +armv7neon_mmm_f32_8x4_cortexa9 25 32 12 0.000015994893141833952 +armv7neon_mmm_f32_8x4_cortexa7 23 32 11 0.000016244020387064702 +armv7neon_mmm_f32_8x6_cortexa9 7 4 11 0.000002387141196478666 +armv7neon_mmm_f32_8x6_cortexa7 7 4 5 0.0000013875129728218855 +armv7neon_mmm_f32_8x4_cortexa7 15 32 4 0.0000039730943727200555 +armv7neon_mmm_f32_8x4_cortexa7 24 32 3 0.000006034875853921332 +armv7neon_mmm_f32_8x4_cortexa9 8 4 5 0.0000016776748569042105 +armv7neon_mmm_f32_8x6_cortexa7 16 4 18 0.0000045013629937264245 +armv7neon_mmm_f32_8x4_cortexa9 8 128 8 0.000008024678845849025 +armv7neon_mmm_f32_8x6_cortexa9 25 4 18 
0.000008301730396887936 +armv7neon_mmm_f32_8x4_generic 7 32 11 0.000004892898505691065 +armv7neon_mmm_f32_8x4_cortexa9 16 128 4 0.000008030995808345325 +armv7neon_mmm_f32_8x4_cortexa7 16 32 3 0.000004194758783711011 +armv7neon_mmm_f32_8x4_generic 25 32 3 0.000006291923472603681 +generic_f32_4x4 3 32 13 0.000005952451348897524 +armv7neon_mmm_f32_8x6_cortexa9 23 32 19 0.000021623159632105592 +armv7neon_mmm_f32_8x4_generic 9 4 7 0.0000030054177268153855 +armv7neon_mmm_f32_8x4_cortexa9 7 128 3 0.0000044229477614846184 +armv7neon_mmm_f32_8x4_generic 25 32 12 0.0000159741241286288 +armv7neon_mmm_f32_8x6_generic 8 32 5 0.000002351046891403544 +armv7neon_mmm_f32_8x6_generic 8 128 17 0.00001572220287732098 +armv7neon_mmm_f32_8x6_cortexa9 25 4 13 0.00000878947090986632 +armv7neon_mmm_f32_8x6_cortexa7 9 128 18 0.00004088792544048485 +armv7neon_mmm_f32_8x4_generic 9 128 8 0.000015874731663155136 +armv7neon_mmm_f32_8x4_generic 16 128 5 0.000015914646023810615 +armv7neon_mmm_f32_8x6_cortexa7 17 32 18 0.000019117240551030633 +armv7neon_mmm_f32_8x4_cortexa9 7 4 11 0.0000026740706182831685 +armv7neon_mmm_f32_8x4_cortexa7 9 128 13 0.00004271080548146737 +armv7neon_mmm_f32_8x6_generic 16 128 11 0.000020976534312868846 +armv7neon_mmm_f32_8x4_cortexa7 16 128 7 0.00002177826510527881 +armv7neon_mmm_f32_8x4_generic 15 32 7 0.000006163372672989967 +armv7neon_mmm_f32_8x4_generic 9 128 11 0.000023697623425846144 +armv7neon_mmm_f32_8x6_cortexa7 16 4 6 0.0000018480490329829836 +armv7neon_mmm_f32_8x4_cortexa9 25 4 7 0.00000554548266542983 +armv7neon_mmm_f32_8x6_generic 15 128 13 0.00003163198682165344 +armv7neon_mmm_f32_8x6_cortexa7 25 32 18 0.000025184955719578352 +armv7neon_mmm_f32_8x6_generic 8 4 6 0.000001113907974305815 +armv7neon_mmm_f32_8x6_cortexa7 16 32 5 0.0000050492542920704264 +armv7neon_mmm_f32_8x6_generic 8 32 19 0.000007097287986832066 +armv7neon_mmm_f32_8x6_cortexa9 25 32 5 0.000007806390304931795 +armv7neon_mmm_f32_8x6_cortexa9 15 128 11 0.000021399956173572563 
+armv7neon_mmm_f32_8x6_generic 16 32 6 0.000003721529767067075 +armv7neon_mmm_f32_8x4_cortexa7 17 32 5 0.000010843745704621824 +armv7neon_mmm_f32_8x4_generic 7 128 7 0.00000843066632668338 +armv7neon_mmm_f32_8x4_generic 23 32 13 0.000016850629087203773 +armv7neon_mmm_f32_8x4_cortexa9 16 32 9 0.000008424770761452617 +armv7neon_mmm_f32_8x4_cortexa9 15 32 13 0.000011652714145510248 +armv7neon_mmm_f32_8x4_generic 23 128 5 0.000023910719482642124 +armv7neon_mmm_f32_8x4_cortexa9 23 128 3 0.000012498528953898262 +armv7neon_mmm_f32_8x4_generic 16 32 8 0.000005593957529271591 +armv7neon_mmm_f32_8x6_cortexa7 7 128 13 0.000021419187824431687 +armv7neon_mmm_f32_8x6_cortexa9 7 128 6 0.000005900049478814555 +armv7neon_mmm_f32_8x4_cortexa7 24 32 11 0.000015824034451655578 +armv7neon_mmm_f32_8x6_cortexa7 16 4 17 0.000004987657461648502 +armv7neon_mmm_f32_8x4_cortexa7 23 32 8 0.000010735439695395534 +armv7neon_mmm_f32_8x4_cortexa9 25 4 5 0.000005392929349101573 +armv7neon_mmm_f32_8x4_generic 25 4 5 0.000005418182338760064 +armv7neon_mmm_f32_8x4_generic 7 128 9 0.000012382199177609924 +armv7neon_mmm_f32_8x4_cortexa9 9 32 9 0.000008579188782470308 +armv7neon_mmm_f32_8x4_generic 8 128 12 0.000011801915167566902 +armv7neon_mmm_f32_8x4_cortexa9 23 4 8 0.0000041152642927900605 +armv7neon_mmm_f32_8x4_generic 7 4 11 0.0000026764041239252324 +generic_f32_4x4 13 128 12 0.00004319526080896362 +generic_f32_4x4 5 32 5 0.000005725142687375275 +generic_f32_4x4 5 128 12 0.00002200938035136394 +armv7neon_mmm_f32_8x4_generic 23 4 5 0.000004369620528795486 +armv7neon_mmm_f32_8x6_generic 9 4 11 0.000003424705713246475 +armv7neon_mmm_f32_8x6_generic 17 32 12 0.000010395428642618952 +generic_f32_4x4 13 32 11 0.000015867743214986118 +armv7neon_mmm_f32_8x6_cortexa9 9 32 17 0.000010850143335646025 +armv7neon_mmm_f32_8x6_cortexa9 23 4 6 0.0000028251957000612876 +armv7neon_mmm_f32_8x6_cortexa9 9 128 7 0.000020914788448206477 +armv7neon_mmm_f32_8x4_cortexa7 15 128 11 0.000032581738066809436 
+armv7neon_mmm_f32_8x4_cortexa9 25 128 3 0.000016370587474295946 +armv7neon_mmm_f32_8x4_cortexa7 23 4 4 0.0000024826703257808083 +armv7neon_mmm_f32_8x4_generic 9 128 4 0.000008108848864529598 +armv7neon_mmm_f32_8x6_cortexa9 25 128 5 0.00002136490649839919 +armv7neon_mmm_f32_8x6_generic 23 128 19 0.00006253554336626349 +generic_f32_4x4 12 4 12 0.000005236763691820686 +armv7neon_mmm_f32_8x6_cortexa9 8 4 7 0.0000019309843288986318 +armv7neon_mmm_f32_8x4_cortexa7 15 4 5 0.000003357403897484634 +armv7neon_mmm_f32_8x6_cortexa9 24 4 6 0.0000024072433754401144 +armv7neon_mmm_f32_8x6_cortexa7 25 128 7 0.000054619400677372974 +armv7neon_mmm_f32_8x6_generic 7 4 19 0.000004265433096040261 +generic_f32_4x4 7 4 12 0.0000041235655802886916 +armv7neon_mmm_f32_8x6_generic 17 128 17 0.000046385914441781564 +armv7neon_mmm_f32_8x4_generic 8 32 7 0.0000032122117166780275 +armv7neon_mmm_f32_8x4_cortexa9 16 4 7 0.0000030113926906216977 +generic_f32_4x4 13 128 13 0.00005791151370731608 +armv7neon_mmm_f32_8x4_cortexa7 9 32 9 0.000010771893810221605 +armv7neon_mmm_f32_8x4_cortexa7 23 4 7 0.000004820473947807521 +armv7neon_mmm_f32_8x4_cortexa9 24 4 13 0.00000731015216867438 +generic_f32_4x4 3 4 5 0.0000018607872137198219 +armv7neon_mmm_f32_8x4_generic 15 32 9 0.000008854455084657494 +armv7neon_mmm_f32_8x6_cortexa9 15 32 6 0.000004174355801441308 +armv7neon_mmm_f32_8x6_cortexa9 16 4 5 0.000002250377451226225 +armv7neon_mmm_f32_8x6_cortexa9 16 32 19 0.000013757586520904355 +armv7neon_mmm_f32_8x6_cortexa9 24 128 11 0.000031276431090202235 +armv7neon_mmm_f32_8x4_generic 25 128 7 0.00003164406261833124 +armv7neon_mmm_f32_8x4_generic 24 4 4 0.0000020584132727266 +generic_f32_4x4 7 128 11 0.000022350186275600858 +armv7neon_mmm_f32_8x4_cortexa9 23 4 4 0.0000022753692017814563 +armv7neon_mmm_f32_8x6_cortexa9 17 32 19 0.00002069369907930334 +generic_f32_4x4 4 128 13 0.00001478326569151377 +armv7neon_mmm_f32_8x6_cortexa9 23 32 18 0.00001626278262596922 +armv7neon_mmm_f32_8x6_generic 7 128 18 
0.000016667585395966858 +armv7neon_mmm_f32_8x4_cortexa9 8 128 3 0.000004452778028984887 +armv7neon_mmm_f32_8x4_cortexa7 16 128 13 0.000042499574729319346 +generic_f32_4x4 3 32 11 0.000004620575096918176 +generic_f32_4x4 4 4 8 0.0000015603089093623964 +armv7neon_mmm_f32_8x6_cortexa9 15 32 7 0.00000772763531212548 +armv7neon_mmm_f32_8x4_cortexa7 9 4 11 0.000004505536773333099 +armv7neon_mmm_f32_8x6_cortexa9 15 32 18 0.000011416109977905379 +armv7neon_mmm_f32_8x4_cortexa7 24 128 12 0.00004730343110835064 +armv7neon_mmm_f32_8x4_cortexa9 8 4 7 0.000001726865000473268 +armv7neon_mmm_f32_8x4_cortexa9 17 4 12 0.00000554999550214869 +armv7neon_mmm_f32_8x4_cortexa9 24 32 3 0.000004919761516416024 +armv7neon_mmm_f32_8x4_cortexa7 7 4 7 0.0000020327105105148146 +generic_f32_4x4 8 32 11 0.000008129741741591272 +armv7neon_mmm_f32_8x6_cortexa9 17 128 6 0.000015685032106978185 +armv7neon_mmm_f32_8x6_cortexa7 24 4 13 0.0000069628322548194484 +generic_f32_4x4 11 32 7 0.000008418658676221946 +armv7neon_mmm_f32_8x4_cortexa9 23 32 13 0.000016859244394935546 +armv7neon_mmm_f32_8x6_cortexa7 15 4 19 0.000007361880271251662 +armv7neon_mmm_f32_8x6_cortexa7 25 128 19 0.00010862057908423823 +generic_f32_4x4 7 128 5 0.00001508121452285347 +armv7neon_mmm_f32_8x4_generic 8 4 12 0.0000020475145659989298 +armv7neon_mmm_f32_8x6_generic 23 32 6 0.000005742234510888845 +armv7neon_mmm_f32_8x6_cortexa9 8 32 7 0.000003926599359988243 +armv7neon_mmm_f32_8x4_cortexa7 16 32 5 0.00000738840656931522 +generic_f32_4x4 3 128 5 0.000007875053042737703 +armv7neon_mmm_f32_8x6_cortexa9 9 128 5 0.000010860557674890965 +armv7neon_mmm_f32_8x6_generic 15 32 11 0.000007797932920988772 +generic_f32_4x4 8 4 3 0.0000019148150797610994 +armv7neon_mmm_f32_8x4_generic 23 32 9 0.000012836974653040903 +armv7neon_mmm_f32_8x4_cortexa7 25 4 3 0.0000035757992765103294 +armv7neon_mmm_f32_8x4_generic 7 32 8 0.0000034181328210189654 +armv7neon_mmm_f32_8x6_cortexa9 25 32 7 0.000014158102081231813 +armv7neon_mmm_f32_8x6_generic 9 4 18 
0.000004486229240869606 +armv7neon_mmm_f32_8x4_cortexa7 8 4 4 0.0000010559381010362017 +armv7neon_mmm_f32_8x4_cortexa9 25 32 3 0.000006292048691193805 +armv7neon_mmm_f32_8x4_cortexa9 24 4 11 0.000005847758703659802 +armv7neon_mmm_f32_8x4_cortexa9 7 128 12 0.00001242120677392976 +armv7neon_mmm_f32_8x4_cortexa7 16 128 9 0.00003210848238315819 +armv7neon_mmm_f32_8x4_cortexa9 24 128 7 0.00002382518082369683 +armv7neon_mmm_f32_8x4_generic 24 32 3 0.0000049198649021802165 +armv7neon_mmm_f32_8x4_cortexa9 25 4 11 0.000007781278370753281 +armv7neon_mmm_f32_8x6_cortexa7 17 32 12 0.000012940679584474003 +armv7neon_mmm_f32_8x6_cortexa7 16 32 17 0.000013190821660739104 +armv7neon_mmm_f32_8x6_cortexa7 25 4 19 0.000012097338566670792 +armv7neon_mmm_f32_8x6_generic 16 32 18 0.000010138504593611297 +armv7neon_mmm_f32_8x4_generic 9 128 5 0.000015947315101595927 +armv7neon_mmm_f32_8x6_cortexa9 23 4 5 0.0000031081830446744932 +armv7neon_mmm_f32_8x6_cortexa9 24 128 7 0.00003103549849311001 +armv7neon_mmm_f32_8x4_cortexa7 25 4 13 0.000010646545231619916 +generic_f32_4x4 9 128 12 0.00003260110763311798 +armv7neon_mmm_f32_8x6_cortexa9 23 4 17 0.000007631985122319121 +armv7neon_mmm_f32_8x4_cortexa9 8 128 13 0.00001574791956845722 +armv7neon_mmm_f32_8x6_cortexa9 23 4 18 0.000007303182081264746 +armv7neon_mmm_f32_8x4_cortexa7 24 128 13 0.00006349168486923089 +armv7neon_mmm_f32_8x4_generic 23 32 11 0.000012994166844502801 +generic_f32_4x4 7 32 4 0.0000031041482961797043 +armv7neon_mmm_f32_8x4_cortexa9 9 32 7 0.000005959447070414145 +armv7neon_mmm_f32_8x4_cortexa7 23 128 11 0.00004845207607490223 +armv7neon_mmm_f32_8x6_cortexa7 17 4 11 0.0000052419075338197185 +generic_f32_4x4 8 4 13 0.00000500809427570026 +armv7neon_mmm_f32_8x4_generic 24 32 7 0.000008731713037367055 +armv7neon_mmm_f32_8x6_cortexa9 25 128 13 0.00006164900543131858 +armv7neon_mmm_f32_8x6_cortexa9 17 4 7 0.000004817790997463295 +armv7neon_mmm_f32_8x6_cortexa9 8 32 19 0.000007141501791335855 +armv7neon_mmm_f32_8x4_cortexa9 16 4 
4 0.000001530544742707957 +armv7neon_mmm_f32_8x6_generic 25 4 17 0.000008895187762533163 +armv7neon_mmm_f32_8x4_cortexa7 23 4 3 0.0000028668169166829235 +armv7neon_mmm_f32_8x6_cortexa7 24 128 11 0.00004128096933839809 +generic_f32_4x4 7 4 13 0.000005447538386200191 +armv7neon_mmm_f32_8x6_cortexa9 9 128 17 0.0000311696244066944 +armv7neon_mmm_f32_8x6_cortexa7 25 4 17 0.000009561336620584211 +armv7neon_mmm_f32_8x4_generic 16 128 8 0.000015609467528565242 +armv7neon_mmm_f32_8x6_cortexa9 7 4 6 0.0000015274572782560943 +armv7neon_mmm_f32_8x6_cortexa9 7 4 19 0.000004327266570693739 +generic_f32_4x4 4 128 3 0.0000042186208925339794 +armv7neon_mmm_f32_8x6_cortexa9 16 32 6 0.000003783089782648409 +armv7neon_mmm_f32_8x4_generic 24 4 12 0.000005240725524660556 +armv7neon_mmm_f32_8x6_generic 24 4 5 0.000003075012528242585 +armv7neon_mmm_f32_8x4_cortexa7 16 128 3 0.000011341419957358612 +generic_f32_4x4 13 32 7 0.000010885101829856154 +generic_f32_4x4 8 128 13 0.00002903953507476824 +armv7neon_mmm_f32_8x6_cortexa9 7 32 12 0.000004528441801230068 +armv7neon_mmm_f32_8x4_cortexa9 8 128 9 0.000011968397967338213 +generic_f32_4x4 9 4 11 0.000005879906070039223 +armv7neon_mmm_f32_8x4_generic 9 128 9 0.000023684049431562338 +armv7neon_mmm_f32_8x6_cortexa7 23 128 19 0.00008252903643379173 +armv7neon_mmm_f32_8x6_generic 23 128 7 0.000031381294348434004 +armv7neon_mmm_f32_8x4_generic 15 4 13 0.000005669760296114337 +armv7neon_mmm_f32_8x4_generic 15 4 11 0.000004507200935249084 +armv7neon_mmm_f32_8x4_generic 15 4 3 0.0000019251390208258458 +generic_f32_4x4 9 32 7 0.00000832450452449881 +generic_f32_4x4 9 4 9 0.000005828318767488796 +armv7neon_mmm_f32_8x4_cortexa7 16 4 3 0.000002093301960254744 +armv7neon_mmm_f32_8x4_generic 15 128 12 0.000023812063445324994 +armv7neon_mmm_f32_8x6_cortexa7 7 4 19 0.000004468409864573403 +generic_f32_4x4 8 4 7 0.000002955604897568221 +armv7neon_mmm_f32_8x6_cortexa9 9 32 12 0.000007238338427088594 +armv7neon_mmm_f32_8x6_generic 16 32 12 
0.00000694068903036478 +armv7neon_mmm_f32_8x6_cortexa7 25 128 6 0.000027308370544312844 +armv7neon_mmm_f32_8x6_cortexa9 24 4 17 0.000006795821587814609 +armv7neon_mmm_f32_8x6_cortexa7 15 128 12 0.00002803261400606977 +generic_f32_4x4 9 32 8 0.000008044423595514706 +armv7neon_mmm_f32_8x4_generic 23 128 9 0.00003548379543389711 +armv7neon_mmm_f32_8x4_cortexa7 17 32 3 0.000005926615429744039 +armv7neon_mmm_f32_8x4_generic 8 128 7 0.000008235490634332163 +armv7neon_mmm_f32_8x4_cortexa7 9 4 12 0.000004255018272123806 +armv7neon_mmm_f32_8x4_cortexa9 16 32 11 0.000008524184894246237 +armv7neon_mmm_f32_8x4_cortexa9 24 32 4 0.000004264845803875226 +generic_f32_4x4 3 4 11 0.000002555428269031667 +armv7neon_mmm_f32_8x4_cortexa7 24 4 4 0.0000022261643758837446 +armv7neon_mmm_f32_8x4_cortexa7 15 128 3 0.000011306220407671872 +armv7neon_mmm_f32_8x6_generic 8 4 18 0.000002337607386621077 +armv7neon_mmm_f32_8x4_cortexa9 25 4 4 0.0000026788269418985845 +generic_f32_4x4 11 4 7 0.000004308040885452502 +armv7neon_mmm_f32_8x4_cortexa9 15 128 4 0.000008223517905022214 +generic_f32_4x4 5 32 11 0.000008288284199113315 +armv7neon_mmm_f32_8x6_cortexa9 9 4 12 0.0000032467302759009237 +armv7neon_mmm_f32_8x4_generic 24 128 7 0.00002382735987416326 +armv7neon_mmm_f32_8x6_generic 24 32 13 0.000015487434262812744 +armv7neon_mmm_f32_8x6_generic 17 128 13 0.00004629753684559049 +armv7neon_mmm_f32_8x6_cortexa7 24 128 13 0.00006117633525152988 +armv7neon_mmm_f32_8x6_generic 24 4 19 0.000008326315174295047 +armv7neon_mmm_f32_8x6_cortexa7 17 32 7 0.000013248073815175261 +armv7neon_mmm_f32_8x4_cortexa9 8 4 9 0.0000022033600613983073 +generic_f32_4x4 13 4 12 0.000007132280989099253 +armv7neon_mmm_f32_8x4_cortexa7 16 128 4 0.000010902014380146801 +armv7neon_mmm_f32_8x6_cortexa7 9 4 6 0.0000019491119088955803 +generic_f32_4x4 3 4 7 0.0000019042920757312053 +armv7neon_mmm_f32_8x4_generic 7 32 5 0.0000033764800411714497 +armv7neon_mmm_f32_8x6_cortexa7 23 4 19 0.000010215694576435441 
+armv7neon_mmm_f32_8x4_generic 17 4 3 0.0000025969094160852644 +generic_f32_4x4 5 4 11 0.0000041349188443565096 +armv7neon_mmm_f32_8x6_cortexa7 9 32 17 0.000013295032359470915 +armv7neon_mmm_f32_8x4_cortexa7 15 128 13 0.00004313466302520057 +armv7neon_mmm_f32_8x4_cortexa9 15 4 8 0.0000030055981545968782 +armv7neon_mmm_f32_8x6_generic 16 4 12 0.0000029570565051151263 +armv7neon_mmm_f32_8x6_cortexa9 8 4 11 0.000002014524477563071 +armv7neon_mmm_f32_8x4_cortexa9 8 4 3 0.0000012065261711370315 +armv7neon_mmm_f32_8x4_cortexa7 7 32 7 0.000004136959156912223 +armv7neon_mmm_f32_8x4_generic 17 4 9 0.000005894535633035915 +generic_f32_4x4 9 4 13 0.00000752174902729491 +armv7neon_mmm_f32_8x6_cortexa9 7 32 5 0.000002364950726746234 +generic_f32_4x4 13 32 4 0.000005521387429490463 +armv7neon_mmm_f32_8x4_generic 8 128 3 0.000004474358471860054 +armv7neon_mmm_f32_8x4_cortexa7 23 4 12 0.0000063640165444956975 +armv7neon_mmm_f32_8x4_generic 7 32 4 0.0000019290673256470733 +armv7neon_mmm_f32_8x4_generic 17 32 12 0.000012180617018748219 +armv7neon_mmm_f32_8x6_generic 17 4 11 0.000004899442085076861 +armv7neon_mmm_f32_8x4_cortexa7 24 128 7 0.000032404931744711066 +armv7neon_mmm_f32_8x4_cortexa9 16 32 12 0.000008099332243893017 +armv7neon_mmm_f32_8x6_generic 9 128 18 0.00003081307981321012 +armv7neon_mmm_f32_8x6_generic 15 128 11 0.00002132277281707897 +armv7neon_mmm_f32_8x6_cortexa7 7 128 6 0.000007540283032400861 +armv7neon_mmm_f32_8x6_generic 25 32 12 0.000013597212519317418 +armv7neon_mmm_f32_8x6_generic 23 4 17 0.000007516814516170969 +armv7neon_mmm_f32_8x6_cortexa7 24 32 11 0.000013420883457543118 +generic_f32_4x4 9 32 11 0.000012087233281631852 +armv7neon_mmm_f32_8x6_cortexa7 24 4 17 0.000007204721001644005 +armv7neon_mmm_f32_8x6_cortexa7 25 32 7 0.000017455571757923716 +armv7neon_mmm_f32_8x4_cortexa9 23 32 11 0.000012992255106920675 +armv7neon_mmm_f32_8x4_cortexa9 9 4 9 0.000004090278120441601 +armv7neon_mmm_f32_8x4_cortexa7 23 32 13 0.000021193261455653347 
+armv7neon_mmm_f32_8x6_cortexa7 24 32 19 0.00002532279550192562 +armv7neon_mmm_f32_8x4_generic 25 128 11 0.00004692401023814237 +armv7neon_mmm_f32_8x4_cortexa9 17 32 7 0.000008780640229470475 +armv7neon_mmm_f32_8x4_cortexa7 9 128 9 0.00003233195046796576 +armv7neon_mmm_f32_8x4_cortexa7 23 128 5 0.000032464808470770534 +armv7neon_mmm_f32_8x4_cortexa9 9 4 5 0.00000293936322263601 +armv7neon_mmm_f32_8x6_cortexa7 25 32 6 0.000008732005752739124 +armv7neon_mmm_f32_8x6_cortexa9 8 4 5 0.0000013801093771363888 +armv7neon_mmm_f32_8x6_cortexa7 16 128 6 0.000013837384210811002 +armv7neon_mmm_f32_8x4_generic 23 32 8 0.000008530859801735756 +armv7neon_mmm_f32_8x4_cortexa7 17 4 8 0.000004162931637169246 +armv7neon_mmm_f32_8x6_generic 7 32 13 0.000006254936643865642 +armv7neon_mmm_f32_8x6_cortexa7 9 32 7 0.00000901443584736606 +armv7neon_mmm_f32_8x4_cortexa7 15 4 7 0.0000034347738366310684 +generic_f32_4x4 9 32 13 0.000015744382160735945 +armv7neon_mmm_f32_8x6_generic 9 32 18 0.000010478111403829218 +armv7neon_mmm_f32_8x6_cortexa9 23 128 11 0.00003166377905937062 +armv7neon_mmm_f32_8x4_generic 15 128 7 0.000016210803535229283 +armv7neon_mmm_f32_8x4_cortexa7 23 4 9 0.000006649300631510142 +armv7neon_mmm_f32_8x4_generic 17 32 3 0.000004810545788546653 +armv7neon_mmm_f32_8x6_generic 24 32 12 0.000010144077161084113 +armv7neon_mmm_f32_8x4_cortexa9 15 32 7 0.000006152877670399033 +armv7neon_mmm_f32_8x4_cortexa9 23 4 12 0.000005900735151833346 +armv7neon_mmm_f32_8x4_cortexa7 16 32 4 0.0000037548262311678015 +armv7neon_mmm_f32_8x6_cortexa7 23 4 18 0.000007713970848848978 +armv7neon_mmm_f32_8x6_cortexa7 16 32 6 0.000004565838055826264 +armv7neon_mmm_f32_8x4_cortexa9 25 32 5 0.000011311149331127226 +armv7neon_mmm_f32_8x6_cortexa9 16 128 18 0.00003053013458613619 +armv7neon_mmm_f32_8x6_cortexa7 15 32 17 0.000013972664044561828 +armv7neon_mmm_f32_8x4_generic 16 32 4 0.000003010794297325924 +armv7neon_mmm_f32_8x6_generic 7 128 7 0.000011021195155003848 +generic_f32_4x4 4 32 13 
0.000005510533372512351 +armv7neon_mmm_f32_8x4_cortexa9 16 32 7 0.000005969619454408248 +armv7neon_mmm_f32_8x4_cortexa9 24 32 12 0.000011900133300984794 +armv7neon_mmm_f32_8x4_cortexa7 8 128 3 0.000005902896580930324 +armv7neon_mmm_f32_8x4_generic 16 32 3 0.000003439802295093395 +armv7neon_mmm_f32_8x4_cortexa7 17 32 4 0.000005503311716430611 +armv7neon_mmm_f32_8x6_generic 9 32 12 0.000007171024246490819 +armv7neon_mmm_f32_8x6_cortexa7 23 32 5 0.000007315917802939159 +armv7neon_mmm_f32_8x6_cortexa7 23 128 12 0.0000413645417403771 +armv7neon_mmm_f32_8x4_generic 16 32 7 0.000005981518355563633 +generic_f32_4x4 7 4 7 0.000003097228954014147 +armv7neon_mmm_f32_8x6_cortexa7 24 4 18 0.000006499393673193748 +armv7neon_mmm_f32_8x6_cortexa9 24 4 7 0.000004723972074276078 +armv7neon_mmm_f32_8x4_cortexa7 16 32 11 0.00001075379201180745 +generic_f32_4x4 5 32 8 0.000005591832396338314 +armv7neon_mmm_f32_8x6_cortexa7 7 4 17 0.000003519015871119777 +armv7neon_mmm_f32_8x6_generic 25 4 13 0.000008668362849414556 +armv7neon_mmm_f32_8x4_cortexa7 15 128 4 0.000011117502498464982 +armv7neon_mmm_f32_8x6_cortexa7 15 4 17 0.000005776336577667951 +armv7neon_mmm_f32_8x4_cortexa9 8 32 13 0.000005695220879131919 +armv7neon_mmm_f32_8x4_cortexa7 24 4 12 0.000005659072390762546 +armv7neon_mmm_f32_8x4_generic 23 128 4 0.00001202844464043909 +armv7neon_mmm_f32_8x6_generic 17 128 7 0.00003102141310178231 +armv7neon_mmm_f32_8x4_generic 16 128 11 0.00002360551094138009 +armv7neon_mmm_f32_8x4_cortexa7 25 128 9 0.00006388979998675838 +armv7neon_mmm_f32_8x4_generic 9 32 9 0.00000858128920884699 +armv7neon_mmm_f32_8x6_cortexa7 9 4 12 0.0000034111339123867222 +armv7neon_mmm_f32_8x6_cortexa9 17 128 11 0.00003127371093460866 +armv7neon_mmm_f32_8x4_cortexa9 15 128 11 0.00002401650595136234 +armv7neon_mmm_f32_8x6_cortexa9 23 128 12 0.00003137052216487605 +armv7neon_mmm_f32_8x6_cortexa9 16 128 17 0.00003103196066568324 +generic_f32_4x4 7 32 3 0.000003268544296254094 +armv7neon_mmm_f32_8x4_generic 9 4 8 
0.0000027781499181131053 +armv7neon_mmm_f32_8x4_cortexa7 16 32 9 0.000010640273784126178 +armv7neon_mmm_f32_8x4_cortexa9 25 4 9 0.00000761410675406794 +generic_f32_4x4 3 4 12 0.00000255471395132772 +generic_f32_4x4 12 32 13 0.00001543431169651167 +armv7neon_mmm_f32_8x6_cortexa9 15 4 7 0.0000037646981265010737 +armv7neon_mmm_f32_8x4_cortexa7 7 4 9 0.0000027630250787792764 +armv7neon_mmm_f32_8x6_cortexa9 15 4 13 0.000005407861539689391 +armv7neon_mmm_f32_8x4_cortexa7 24 4 5 0.000004483592435521624 +armv7neon_mmm_f32_8x6_cortexa9 16 128 5 0.000010995701297253243 +armv7neon_mmm_f32_8x4_cortexa9 7 4 13 0.000003344708907386396 +generic_f32_4x4 3 4 4 0.0000012029323677452384 +armv7neon_mmm_f32_8x6_cortexa7 8 4 5 0.0000014134340225429588 +armv7neon_mmm_f32_8x4_cortexa7 16 4 5 0.000003166142335812665 +armv7neon_mmm_f32_8x6_cortexa9 15 4 17 0.000005534126802188086 +armv7neon_mmm_f32_8x4_generic 17 128 4 0.000011910311848497855 +armv7neon_mmm_f32_8x4_cortexa9 17 128 9 0.00003520115855849476 +armv7neon_mmm_f32_8x6_cortexa7 24 128 17 0.00006131644295051564 +armv7neon_mmm_f32_8x4_generic 8 32 12 0.000004263261337563466 +armv7neon_mmm_f32_8x4_generic 15 128 13 0.000031724034290535855 +generic_f32_4x4 13 128 4 0.000014842273639272524 +armv7neon_mmm_f32_8x6_generic 16 128 17 0.00003115527449889593 +armv7neon_mmm_f32_8x4_cortexa7 25 128 13 0.00008540107215655228 +armv7neon_mmm_f32_8x4_cortexa9 17 4 8 0.000003867693626667699 +armv7neon_mmm_f32_8x6_generic 9 4 6 0.0000018447023821962963 +armv7neon_mmm_f32_8x4_generic 15 4 5 0.0000031342530980756114 +armv7neon_mmm_f32_8x4_cortexa9 25 32 11 0.000016711914928453156 +armv7neon_mmm_f32_8x4_cortexa9 9 4 7 0.0000029971874356879734 +armv7neon_mmm_f32_8x4_cortexa9 17 32 8 0.000008299083390781581 +armv7neon_mmm_f32_8x6_generic 9 128 11 0.000020922916297729766 +armv7neon_mmm_f32_8x6_cortexa9 25 4 11 0.000006455717226984232 +generic_f32_4x4 9 128 13 0.00004380056172414816 +armv7neon_mmm_f32_8x4_generic 9 4 3 0.0000018459881520629983 
+generic_f32_4x4 13 32 13 0.000020718460422326147 +generic_f32_4x4 11 128 11 0.00003310679264480642 +armv7neon_mmm_f32_8x4_cortexa7 17 128 12 0.00004762075330394028 +armv7neon_mmm_f32_8x6_cortexa9 25 4 6 0.0000031406906098450826 +armv7neon_mmm_f32_8x4_cortexa7 15 4 3 0.0000020595941028725178 +generic_f32_4x4 4 32 5 0.000003094039509493947 +armv7neon_mmm_f32_8x4_cortexa7 17 4 13 0.000008167970707514618 +armv7neon_mmm_f32_8x6_cortexa9 16 128 7 0.000020852896757305592 +armv7neon_mmm_f32_8x4_generic 24 32 12 0.000011875110028324704 +armv7neon_mmm_f32_8x6_cortexa7 23 32 6 0.0000070036066823672265 +armv7neon_mmm_f32_8x4_generic 24 32 9 0.000012379847822938195 +armv7neon_mmm_f32_8x6_generic 16 32 7 0.000007288146983219525 +armv7neon_mmm_f32_8x4_generic 8 32 4 0.0000017213389522499293 +armv7neon_mmm_f32_8x6_cortexa7 25 32 12 0.00001698633380929768 +armv7neon_mmm_f32_8x4_generic 25 32 8 0.000010843062239811664 +armv7neon_mmm_f32_8x6_generic 16 128 5 0.000010970239776322642 +armv7neon_mmm_f32_8x6_cortexa9 23 32 12 0.00001105222841817834 +armv7neon_mmm_f32_8x4_cortexa7 7 4 12 0.0000028259675178908435 +armv7neon_mmm_f32_8x4_generic 17 32 13 0.000016456626258678596 +armv7neon_mmm_f32_8x6_generic 23 32 19 0.000021489093420466062 +armv7neon_mmm_f32_8x6_cortexa9 9 32 7 0.000007388528782309879 +armv7neon_mmm_f32_8x6_generic 16 128 13 0.0000308218952023933 +armv7neon_mmm_f32_8x4_cortexa9 24 128 4 0.000011799248370378306 +armv7neon_mmm_f32_8x6_cortexa7 15 4 5 0.000002311967551514126 +armv7neon_mmm_f32_8x6_cortexa9 15 32 19 0.000015002244448339703 +armv7neon_mmm_f32_8x4_cortexa7 25 32 4 0.000007113231524121585 +armv7neon_mmm_f32_8x4_generic 9 4 9 0.0000040996991473354 +generic_f32_4x4 11 4 11 0.00000604219727392619 +armv7neon_mmm_f32_8x4_generic 23 128 12 0.0000351784079527385 +armv7neon_mmm_f32_8x4_generic 8 32 8 0.0000029996111096440974 +generic_f32_4x4 3 128 11 0.000011575521961547323 +armv7neon_mmm_f32_8x6_generic 17 4 19 0.000008636925572322338 +armv7neon_mmm_f32_8x6_cortexa7 17 
32 17 0.000019651252774271738 +armv7neon_mmm_f32_8x4_cortexa7 25 32 5 0.000014279031192163009 +armv7neon_mmm_f32_8x6_cortexa9 7 128 5 0.000005734808712355398 +armv7neon_mmm_f32_8x4_cortexa9 15 4 5 0.0000031170038028798296 +armv7neon_mmm_f32_8x4_cortexa9 24 128 13 0.000046424963618177915 +armv7neon_mmm_f32_8x6_generic 16 4 5 0.0000022248730615242665 +generic_f32_4x4 8 128 9 0.0000220091711298822 +armv7neon_mmm_f32_8x6_cortexa9 24 32 17 0.000015790292495872178 +armv7neon_mmm_f32_8x6_cortexa7 25 4 12 0.000006064814099731578 +armv7neon_mmm_f32_8x4_generic 9 32 3 0.0000033141535571016215 +armv7neon_mmm_f32_8x4_cortexa7 24 32 7 0.000010955777874109968 +armv7neon_mmm_f32_8x6_generic 7 32 19 0.000008240412026502634 +armv7neon_mmm_f32_8x6_generic 15 128 12 0.000021273134898855332 +armv7neon_mmm_f32_8x4_generic 9 128 12 0.000023427967692188372 +generic_f32_4x4 9 128 4 0.000011233611698379809 +armv7neon_mmm_f32_8x4_generic 23 32 3 0.000004898603928972602 +armv7neon_mmm_f32_8x4_cortexa9 15 4 7 0.000003196236363013327 +generic_f32_4x4 13 128 9 0.000043631376927673965 +armv7neon_mmm_f32_8x6_cortexa7 8 32 13 0.000006769099621081802 +armv7neon_mmm_f32_8x6_cortexa9 16 32 11 0.000007489162129911177 +armv7neon_mmm_f32_8x6_cortexa7 15 4 11 0.000004047240611447739 +armv7neon_mmm_f32_8x6_cortexa9 8 128 12 0.000010521407486118232 +armv7neon_mmm_f32_8x6_generic 24 128 12 0.000030475350289631447 +armv7neon_mmm_f32_8x6_cortexa9 8 4 18 0.0000023963651215663894 +armv7neon_mmm_f32_8x4_generic 7 4 13 0.0000033498033347340617 +armv7neon_mmm_f32_8x6_cortexa7 24 128 5 0.000021225542006705262 +armv7neon_mmm_f32_8x6_generic 8 128 7 0.000010658035873946595 +armv7neon_mmm_f32_8x6_cortexa9 16 128 13 0.00003086535920353521 +armv7neon_mmm_f32_8x6_cortexa7 9 4 13 0.000005015620872516637 +armv7neon_mmm_f32_8x6_cortexa9 8 4 13 0.0000025412882105309308 +armv7neon_mmm_f32_8x4_cortexa9 24 32 11 0.000012564889907233448 +armv7neon_mmm_f32_8x4_generic 7 128 13 0.000016356824884577277 
+armv7neon_mmm_f32_8x4_cortexa7 15 32 11 0.00001113088620100476 +armv7neon_mmm_f32_8x6_generic 23 32 18 0.000016159850846463288 +armv7neon_mmm_f32_8x6_generic 15 32 19 0.000014889331973563625 +armv7neon_mmm_f32_8x4_generic 7 128 11 0.000012426167611771446 +generic_f32_4x4 4 4 4 0.000001047996290990507 +armv7neon_mmm_f32_8x4_generic 23 4 9 0.000006177048756338135 +armv7neon_mmm_f32_8x6_cortexa9 16 4 11 0.0000034974057138003183 +armv7neon_mmm_f32_8x6_cortexa9 7 128 12 0.00001128966410991496 +armv7neon_mmm_f32_8x4_cortexa9 9 4 8 0.0000027723235031605195 +armv7neon_mmm_f32_8x4_generic 7 128 12 0.000012422711675746062 +armv7neon_mmm_f32_8x4_cortexa7 25 128 12 0.0000632498211882511 +armv7neon_mmm_f32_8x4_generic 7 128 5 0.000008380542348646574 +armv7neon_mmm_f32_8x4_cortexa9 17 4 3 0.000002593764492859534 +generic_f32_4x4 9 4 3 0.0000025396068006223777 +generic_f32_4x4 9 4 12 0.0000055571336153173335 +armv7neon_mmm_f32_8x4_cortexa7 16 128 8 0.000021301840263398342 +armv7neon_mmm_f32_8x4_generic 9 32 12 0.000008360119586696751 +armv7neon_mmm_f32_8x6_cortexa9 23 32 13 0.00001642300983789001 +armv7neon_mmm_f32_8x4_cortexa7 9 32 13 0.000014115360584179656 +armv7neon_mmm_f32_8x4_generic 7 4 4 0.0000011988761531825325 +generic_f32_4x4 4 32 12 0.000004173516705669169 +armv7neon_mmm_f32_8x6_cortexa7 8 4 17 0.000002748952290655516 +armv7neon_mmm_f32_8x6_cortexa9 15 128 17 0.000031856750398890617 +armv7neon_mmm_f32_8x4_cortexa9 17 128 4 0.00001190476277599742 +armv7neon_mmm_f32_8x6_cortexa7 23 4 17 0.000008057629471932516 +armv7neon_mmm_f32_8x4_cortexa7 24 128 9 0.00004787648599530554 +armv7neon_mmm_f32_8x4_cortexa7 23 4 5 0.000004689549145067684 +armv7neon_mmm_f32_8x6_cortexa9 17 4 18 0.000006445994468840472 +armv7neon_mmm_f32_8x6_generic 9 128 17 0.00003107927056434744 +armv7neon_mmm_f32_8x4_generic 24 128 8 0.000023288445002113136 +armv7neon_mmm_f32_8x6_cortexa9 16 32 17 0.000010725045258389392 +armv7neon_mmm_f32_8x4_cortexa7 17 128 11 0.00004814359255643903 
+armv7neon_mmm_f32_8x4_generic 7 4 3 0.0000011794687664451176 +generic_f32_4x4 7 32 5 0.000005808524626140803 +armv7neon_mmm_f32_8x6_generic 24 32 11 0.000010897102444398395 +armv7neon_mmm_f32_8x4_generic 7 4 7 0.0000019250466424784192 +armv7neon_mmm_f32_8x4_cortexa9 7 32 13 0.000006310599463529341 +generic_f32_4x4 4 32 8 0.000002964574479014875 +armv7neon_mmm_f32_8x6_cortexa7 17 4 12 0.000004769238750897463 +armv7neon_mmm_f32_8x6_generic 8 32 17 0.000005580365730578937 +armv7neon_mmm_f32_8x4_cortexa9 7 4 5 0.0000018767262951495324 +armv7neon_mmm_f32_8x4_cortexa9 9 128 3 0.000008376722642527261 +generic_f32_4x4 8 32 8 0.000005427007518436356 +generic_f32_4x4 3 32 7 0.000003269319162898471 +armv7neon_mmm_f32_8x6_generic 23 128 11 0.000031615917676258355 +generic_f32_4x4 11 32 3 0.000004643343538119153 +armv7neon_mmm_f32_8x6_generic 17 4 13 0.000006685914546845586 +armv7neon_mmm_f32_8x4_cortexa9 7 128 13 0.000016356503104634956 +armv7neon_mmm_f32_8x6_generic 8 4 11 0.000001987955413940483 +armv7neon_mmm_f32_8x6_cortexa9 23 32 6 0.0000058061790469852035 +armv7neon_mmm_f32_8x6_cortexa9 17 128 12 0.00003080777419909108 +generic_f32_4x4 3 4 13 0.00000320269810797615 +armv7neon_mmm_f32_8x6_cortexa9 17 32 12 0.000010479324211722284 +armv7neon_mmm_f32_8x4_cortexa9 7 4 4 0.000001192052740790223 +armv7neon_mmm_f32_8x4_cortexa9 24 128 12 0.00003457434886740767 +armv7neon_mmm_f32_8x6_cortexa9 23 32 11 0.000011348364973457056 +armv7neon_mmm_f32_8x4_generic 16 4 12 0.000003656453112482303 +armv7neon_mmm_f32_8x6_cortexa9 16 128 6 0.000010542301986141948 +armv7neon_mmm_f32_8x4_generic 24 128 4 0.000011845830939195392 +armv7neon_mmm_f32_8x6_cortexa7 9 32 18 0.000013032529505137248 +armv7neon_mmm_f32_8x4_generic 24 128 12 0.00003455867526442596 +armv7neon_mmm_f32_8x4_generic 23 128 7 0.00002405539721722316 +armv7neon_mmm_f32_8x4_cortexa9 23 32 4 0.000004484967279906347 +armv7neon_mmm_f32_8x4_cortexa9 17 32 4 0.000004369095064954239 +armv7neon_mmm_f32_8x6_cortexa7 8 4 12 
0.000001838051428755667 +armv7neon_mmm_f32_8x4_generic 15 4 8 0.000003030902158049641 +armv7neon_mmm_f32_8x4_cortexa7 8 128 9 0.000016361621231447548 +armv7neon_mmm_f32_8x6_generic 15 4 5 0.000002193007903880869 +armv7neon_mmm_f32_8x6_cortexa9 24 32 18 0.000015047680763476656 +armv7neon_mmm_f32_8x6_generic 25 128 6 0.00002065407414270236 +armv7neon_mmm_f32_8x4_generic 25 32 9 0.00001655596981137993 +armv7neon_mmm_f32_8x4_cortexa7 7 4 8 0.000002046321793305707 +armv7neon_mmm_f32_8x4_generic 24 32 5 0.000008563586932451436 +armv7neon_mmm_f32_8x6_cortexa7 25 32 19 0.00003389329143278319 +generic_f32_4x4 3 4 3 0.0000011908985597580253 +generic_f32_4x4 11 128 13 0.000043852326643092945 +armv7neon_mmm_f32_8x6_cortexa9 7 32 6 0.0000025216564013293366 +generic_f32_4x4 8 128 11 0.00002204318798460829 +generic_f32_4x4 9 32 5 0.00000828023901975494 +armv7neon_mmm_f32_8x6_generic 25 4 11 0.000006352168944400001 +armv7neon_mmm_f32_8x4_cortexa9 24 32 5 0.000008590022043400085 +armv7neon_mmm_f32_8x4_generic 8 32 3 0.0000019449019781287236 +armv7neon_mmm_f32_8x4_cortexa7 24 128 8 0.0000317811111212887 +armv7neon_mmm_f32_8x4_cortexa9 7 128 9 0.000012372639572013476 +armv7neon_mmm_f32_8x6_generic 15 128 19 0.00004250719000509339 +armv7neon_mmm_f32_8x6_cortexa7 17 4 7 0.000005084587454728339 +armv7neon_mmm_f32_8x6_generic 15 32 5 0.000004169790668084077 +generic_f32_4x4 9 128 3 0.000011555287825087273 +armv7neon_mmm_f32_8x4_cortexa7 25 128 7 0.00004317760438423037 +generic_f32_4x4 8 32 12 0.00000779706808746927 +armv7neon_mmm_f32_8x6_generic 8 32 12 0.000003719914023610946 +armv7neon_mmm_f32_8x6_cortexa7 15 128 17 0.000041839968426337905 +armv7neon_mmm_f32_8x6_cortexa7 8 128 13 0.000020713409071600263 +armv7neon_mmm_f32_8x6_cortexa9 24 4 5 0.000003104317520805454 +armv7neon_mmm_f32_8x6_cortexa9 9 4 13 0.000004778161797091213 +armv7neon_mmm_f32_8x4_cortexa9 25 128 12 0.00004644468928643539 +generic_f32_4x4 11 4 9 0.000005973042856611991 +armv7neon_mmm_f32_8x4_cortexa7 23 128 8 
0.00003217939318511314 +armv7neon_mmm_f32_8x6_generic 24 4 11 0.00000490822983265492 +armv7neon_mmm_f32_8x4_generic 17 4 12 0.000005542606566473033 +armv7neon_mmm_f32_8x6_cortexa9 16 32 5 0.000004236072973244724 +armv7neon_mmm_f32_8x6_cortexa7 24 32 12 0.000012668891408140166 +armv7neon_mmm_f32_8x6_cortexa7 17 128 19 0.00008180995267293593 +armv7neon_mmm_f32_8x4_cortexa7 8 4 9 0.0000023929041278364927 +armv7neon_mmm_f32_8x4_generic 17 4 11 0.000005996779011118073 +armv7neon_mmm_f32_8x4_generic 23 32 5 0.000008783310902330428 +armv7neon_mmm_f32_8x4_cortexa9 24 4 5 0.000004126054215412369 +armv7neon_mmm_f32_8x4_cortexa7 15 4 4 0.0000018664924896343623 +generic_f32_4x4 12 32 12 0.000011419939741453591 +armv7neon_mmm_f32_8x6_cortexa9 9 128 19 0.00004140314991254094 +armv7neon_mmm_f32_8x6_cortexa7 24 4 19 0.000008976504261442944 +armv7neon_mmm_f32_8x6_cortexa7 23 32 12 0.000013517680029411935 +armv7neon_mmm_f32_8x6_generic 23 128 17 0.00004767212397281204 +armv7neon_mmm_f32_8x6_cortexa7 9 128 12 0.000027472854173113385 +generic_f32_4x4 13 4 4 0.0000027493566687604084 +armv7neon_mmm_f32_8x6_cortexa7 7 32 7 0.000005097398974457722 +armv7neon_mmm_f32_8x4_generic 25 4 3 0.0000033394241283747185 +armv7neon_mmm_f32_8x4_cortexa7 25 4 5 0.000005839267769079541 +generic_f32_4x4 8 4 9 0.0000039467461932377605 +armv7neon_mmm_f32_8x6_cortexa7 8 32 6 0.0000025355396885136015 +armv7neon_mmm_f32_8x4_cortexa7 7 4 13 0.0000035454436358925364 +generic_f32_4x4 12 128 5 0.000022119589677203316 +armv7neon_mmm_f32_8x4_cortexa9 17 32 5 0.000008619398425928128 +generic_f32_4x4 5 32 12 0.00000809048145029521 +armv7neon_mmm_f32_8x6_generic 7 32 12 0.000004497135112528872 +armv7neon_mmm_f32_8x6_cortexa9 17 128 18 0.00004612047264624732 +armv7neon_mmm_f32_8x4_cortexa9 24 128 9 0.00003503924267370847 +armv7neon_mmm_f32_8x6_cortexa9 25 128 18 0.00006126219444229572 +armv7neon_mmm_f32_8x4_cortexa9 7 32 7 0.0000034009841492082025 +armv7neon_mmm_f32_8x6_cortexa7 8 32 18 0.0000065795660386672954 
+armv7neon_mmm_f32_8x4_cortexa7 9 4 7 0.0000032333021816508883 +generic_f32_4x4 11 4 5 0.000004260253310204167 +armv7neon_mmm_f32_8x4_cortexa7 15 32 12 0.00001091466040761069 +armv7neon_mmm_f32_8x6_generic 25 128 17 0.00006242813577501301 +armv7neon_mmm_f32_8x6_cortexa7 23 128 17 0.00006230757214971725 +armv7neon_mmm_f32_8x4_cortexa7 9 4 3 0.000001972675027149826 +armv7neon_mmm_f32_8x6_generic 9 32 19 0.000013969471313273173 +armv7neon_mmm_f32_8x4_cortexa7 9 4 4 0.0000017499654922061095 +generic_f32_4x4 8 4 8 0.0000026159902565290668 +armv7neon_mmm_f32_8x6_generic 23 128 12 0.000031313730389866355 +armv7neon_mmm_f32_8x4_cortexa7 15 128 7 0.000021942752305858642 +armv7neon_mmm_f32_8x6_cortexa7 9 32 6 0.000004675388926372666 +armv7neon_mmm_f32_8x6_cortexa7 8 32 5 0.000002772093015202498 +armv7neon_mmm_f32_8x6_cortexa7 9 32 19 0.000017344416985284313 +armv7neon_mmm_f32_8x4_cortexa9 8 128 7 0.000008229649908268804 +generic_f32_4x4 3 128 12 0.000011565133509989783 +generic_f32_4x4 9 32 12 0.000011785850787897203 +armv7neon_mmm_f32_8x6_cortexa9 17 128 13 0.00004651964090971228 +armv7neon_mmm_f32_8x6_cortexa9 25 32 6 0.0000071127343061349165 +armv7neon_mmm_f32_8x6_cortexa7 25 128 13 0.00008178262598187339 +armv7neon_mmm_f32_8x4_cortexa9 25 32 13 0.000021697932623977668 +armv7neon_mmm_f32_8x4_cortexa9 16 4 11 0.0000040619610195960516 +armv7neon_mmm_f32_8x4_generic 23 128 3 0.000012468771697124281 +generic_f32_4x4 11 32 13 0.000015931672245334774 +generic_f32_4x4 8 32 9 0.000008097203894273956 +armv7neon_mmm_f32_8x4_cortexa9 17 4 5 0.000004171816997731455 +armv7neon_mmm_f32_8x6_generic 15 32 17 0.000011434650801767795 +armv7neon_mmm_f32_8x4_cortexa7 25 128 3 0.000022089826731240803 +armv7neon_mmm_f32_8x4_cortexa9 17 32 9 0.000012562108538363274 +generic_f32_4x4 7 128 4 0.000007740927731735862 +armv7neon_mmm_f32_8x6_cortexa7 16 128 17 0.00004105266320070572 +armv7neon_mmm_f32_8x6_generic 17 128 5 0.000016103121774230763 +armv7neon_mmm_f32_8x6_cortexa9 25 128 19 
0.0000820798385767341 +armv7neon_mmm_f32_8x4_generic 15 32 12 0.000008721032730753763 +armv7neon_mmm_f32_8x4_generic 25 4 9 0.000007623970700084407 +armv7neon_mmm_f32_8x4_generic 17 32 11 0.000012672883096785458 +armv7neon_mmm_f32_8x6_generic 16 32 19 0.000013691087754459915 +armv7neon_mmm_f32_8x6_generic 9 4 12 0.0000031605677124646883 +generic_f32_4x4 5 4 5 0.0000029759889706969683 +armv7neon_mmm_f32_8x4_generic 16 32 9 0.00000841650213766426 +generic_f32_4x4 8 32 13 0.000010487684258287977 +generic_f32_4x4 4 128 5 0.000007733485289678424 +armv7neon_mmm_f32_8x6_cortexa7 8 4 19 0.0000033370479940342483 +armv7neon_mmm_f32_8x6_cortexa7 16 4 13 0.000004820409587660123 +armv7neon_mmm_f32_8x6_cortexa7 7 128 5 0.000007382676538540774 +armv7neon_mmm_f32_8x6_cortexa9 9 4 7 0.0000034194906166596 +armv7neon_mmm_f32_8x4_generic 24 128 3 0.000012489994751068132 +armv7neon_mmm_f32_8x6_generic 17 4 5 0.0000029369592565703416 +armv7neon_mmm_f32_8x6_cortexa9 7 32 7 0.000004299364469367014 +generic_f32_4x4 8 128 8 0.000014675074170610452 +armv7neon_mmm_f32_8x6_generic 24 32 6 0.000005321602550624656 +generic_f32_4x4 8 128 3 0.000007920826245860217 +armv7neon_mmm_f32_8x6_cortexa7 9 128 17 0.000041149683356221505 +armv7neon_mmm_f32_8x6_generic 15 128 18 0.0000317697933254728 +armv7neon_mmm_f32_8x4_generic 15 32 11 0.000008972113808644295 +armv7neon_mmm_f32_8x6_generic 17 128 11 0.00003128672479118261 +armv7neon_mmm_f32_8x6_generic 16 32 5 0.000004210171223198828 +armv7neon_mmm_f32_8x4_cortexa9 23 32 7 0.000008919434958029703 +armv7neon_mmm_f32_8x6_cortexa7 23 4 11 0.000005630918362501218 +generic_f32_4x4 13 4 13 0.000009734041681688549 +armv7neon_mmm_f32_8x4_cortexa7 23 4 8 0.000004419109488183114 +armv7neon_mmm_f32_8x6_generic 17 128 18 0.00004641763657018708 +armv7neon_mmm_f32_8x6_cortexa7 23 32 7 0.000013604910503058097 +armv7neon_mmm_f32_8x4_cortexa9 16 4 8 0.0000025811126658507737 +armv7neon_mmm_f32_8x6_generic 16 4 17 0.000004664791456959476 +armv7neon_mmm_f32_8x6_cortexa9 15 
128 19 0.00004234538403110192 +armv7neon_mmm_f32_8x4_cortexa7 7 128 8 0.000011300292613931975 +generic_f32_4x4 4 32 7 0.000003116445195774716 +armv7neon_mmm_f32_8x4_generic 17 128 3 0.000012383648487655847 +armv7neon_mmm_f32_8x6_cortexa9 24 32 12 0.00001022987009394415 +armv7neon_mmm_f32_8x4_cortexa9 17 32 11 0.000012684638765009217 +armv7neon_mmm_f32_8x6_cortexa9 9 128 18 0.000030893903863542745 +armv7neon_mmm_f32_8x6_cortexa7 9 4 18 0.000004852117278755586 +armv7neon_mmm_f32_8x6_cortexa9 8 128 13 0.00001571685239499597 +armv7neon_mmm_f32_8x6_cortexa9 23 4 7 0.000005166723329090456 +armv7neon_mmm_f32_8x6_cortexa9 7 128 18 0.000016699359662800567 +armv7neon_mmm_f32_8x4_generic 17 32 4 0.0000043747418052874395 +armv7neon_mmm_f32_8x4_cortexa9 25 128 13 0.00006245169757364539 +armv7neon_mmm_f32_8x4_generic 24 128 11 0.00003520805509267024 +armv7neon_mmm_f32_8x4_generic 7 4 8 0.000001940897025671589 +armv7neon_mmm_f32_8x4_cortexa9 24 32 8 0.000008105445830543554 +generic_f32_4x4 3 32 5 0.000003239638778592903 +armv7neon_mmm_f32_8x6_generic 16 128 6 0.000010481759011880018 +armv7neon_mmm_f32_8x6_cortexa9 23 4 11 0.000005380241199288215 +armv7neon_mmm_f32_8x6_cortexa7 24 4 11 0.000005233250260928121 +armv7neon_mmm_f32_8x6_cortexa9 23 4 19 0.000009671598448323651 +armv7neon_mmm_f32_8x6_cortexa7 23 4 6 0.0000029172933842802905 +armv7neon_mmm_f32_8x4_cortexa9 23 32 8 0.000008528926728389173 +armv7neon_mmm_f32_8x4_cortexa9 24 128 8 0.000023207480788900347 +generic_f32_4x4 7 32 11 0.00000844108827118094 +generic_f32_4x4 12 4 5 0.000004095794522926841 +armv7neon_mmm_f32_8x4_generic 25 4 11 0.000007781652426587433 +armv7neon_mmm_f32_8x4_generic 16 32 12 0.000008090083939378293 +armv7neon_mmm_f32_8x4_generic 9 128 7 0.000016030853255705617 +armv7neon_mmm_f32_8x6_generic 15 4 19 0.000006928708092302355 +armv7neon_mmm_f32_8x6_cortexa7 24 128 12 0.00004054881877870741 +generic_f32_4x4 8 4 4 0.0000015677362682744682 +armv7neon_mmm_f32_8x6_generic 7 128 19 0.000021822554568277356 
+armv7neon_mmm_f32_8x6_cortexa9 24 4 12 0.00000423613681186459 +armv7neon_mmm_f32_8x4_cortexa7 16 4 4 0.0000016584422934049926 +armv7neon_mmm_f32_8x6_cortexa7 15 4 12 0.0000039842977137122266 +armv7neon_mmm_f32_8x4_cortexa7 7 128 4 0.000005876303683712701 +armv7neon_mmm_f32_8x6_cortexa9 7 4 5 0.000001360389294622995 +armv7neon_mmm_f32_8x4_cortexa7 24 4 9 0.000006182082152017003 +armv7neon_mmm_f32_8x6_generic 7 128 17 0.000016508734849042377 +armv7neon_mmm_f32_8x4_generic 17 128 9 0.000035220470083517575 +generic_f32_4x4 11 32 4 0.000004333160180811584 +armv7neon_mmm_f32_8x6_cortexa7 7 32 12 0.000005349730883357719 +armv7neon_mmm_f32_8x4_generic 8 32 5 0.0000031582408088385206 +armv7neon_mmm_f32_8x6_cortexa9 8 32 6 0.000002149345359409538 +generic_f32_4x4 5 32 13 0.000010767359583748404 +armv7neon_mmm_f32_8x4_generic 17 32 5 0.000008612220700720838 +generic_f32_4x4 5 4 4 0.0000016841544626715604 +armv7neon_mmm_f32_8x4_cortexa9 9 32 8 0.000005740276944412148 +armv7neon_mmm_f32_8x6_cortexa9 23 128 7 0.00003146196425036688 +armv7neon_mmm_f32_8x4_cortexa9 25 128 9 0.00004693016970364154 +armv7neon_mmm_f32_8x4_generic 24 4 8 0.0000036269745603468488 +armv7neon_mmm_f32_8x6_cortexa9 24 32 5 0.000006081990180966961 +armv7neon_mmm_f32_8x6_generic 7 4 6 0.000001511590523783866 +generic_f32_4x4 13 128 7 0.00002944642529063838 +armv7neon_mmm_f32_8x4_cortexa9 8 4 11 0.000002254110071814015 +armv7neon_mmm_f32_8x6_cortexa9 24 128 19 0.00006153515122537746 +armv7neon_mmm_f32_8x4_cortexa9 16 128 13 0.00003108122366900544 +generic_f32_4x4 11 128 9 0.000033046521573243154 +armv7neon_mmm_f32_8x4_generic 7 4 9 0.0000026137777410885136 +armv7neon_mmm_f32_8x6_cortexa7 7 4 13 0.0000034217472919863715 +armv7neon_mmm_f32_8x6_generic 16 4 11 0.000003453597254301285 +armv7neon_mmm_f32_8x4_cortexa7 17 32 11 0.000015933978694713537 +armv7neon_mmm_f32_8x6_cortexa7 23 4 12 0.0000053181832633687915 +armv7neon_mmm_f32_8x6_generic 17 32 6 0.000005447406276972202 +armv7neon_mmm_f32_8x4_cortexa7 7 32 4 
0.000002302756052490624 +armv7neon_mmm_f32_8x4_cortexa9 15 4 13 0.000005662932902369066 +armv7neon_mmm_f32_8x4_cortexa7 17 32 8 0.000010470053145450968 +armv7neon_mmm_f32_8x4_generic 17 4 8 0.000003846416592921501 +armv7neon_mmm_f32_8x4_cortexa7 25 32 7 0.000014416885420743293 +armv7neon_mmm_f32_8x6_cortexa9 7 4 7 0.000002316979424271203 +armv7neon_mmm_f32_8x6_cortexa9 16 4 17 0.000004718807435203576 +generic_f32_4x4 11 32 12 0.000011882704283190954 +armv7neon_mmm_f32_8x4_cortexa7 23 32 9 0.000016122379106643026 +armv7neon_mmm_f32_8x6_cortexa7 9 32 11 0.000009102642142770847 +armv7neon_mmm_f32_8x6_generic 25 4 7 0.000006116994655414601 +armv7neon_mmm_f32_8x4_generic 23 4 3 0.000002685832196401119 +armv7neon_mmm_f32_8x6_cortexa9 25 32 11 0.000014409038834226244 +armv7neon_mmm_f32_8x6_generic 23 32 12 0.000010966793413222081 +armv7neon_mmm_f32_8x4_cortexa7 15 4 8 0.0000032350546103453447 +armv7neon_mmm_f32_8x6_cortexa7 15 128 7 0.000027943938585492828 +generic_f32_4x4 7 4 4 0.0000017389973856077342 +armv7neon_mmm_f32_8x6_generic 8 4 12 0.0000017181145762324274 +armv7neon_mmm_f32_8x4_cortexa9 15 4 11 0.000004502163536054048 +armv7neon_mmm_f32_8x6_generic 15 128 6 0.000010890629782885533 +armv7neon_mmm_f32_8x4_cortexa7 23 128 12 0.00004796960186341638 +armv7neon_mmm_f32_8x6_generic 8 32 7 0.000003896823245256928 +armv7neon_mmm_f32_8x6_cortexa9 15 128 7 0.000021306419489905895 +armv7neon_mmm_f32_8x6_cortexa7 7 4 11 0.0000024493622993257104 +armv7neon_mmm_f32_8x6_cortexa7 7 32 19 0.000009918011828071643 +armv7neon_mmm_f32_8x4_cortexa9 24 4 7 0.000004277922454765341 +armv7neon_mmm_f32_8x4_cortexa7 9 4 5 0.0000031825583861368363 +armv7neon_mmm_f32_8x6_cortexa9 8 32 11 0.000004007622996506884 +armv7neon_mmm_f32_8x6_generic 15 4 11 0.0000038091887644203395 +armv7neon_mmm_f32_8x6_cortexa7 23 128 11 0.000041674027523638683 +armv7neon_mmm_f32_8x6_cortexa7 17 4 18 0.0000068571632794369916 +armv7neon_mmm_f32_8x6_cortexa9 9 128 12 0.000020793132831426426 
+armv7neon_mmm_f32_8x6_generic 25 32 19 0.000027146627303420713 +armv7neon_mmm_f32_8x6_cortexa7 9 128 19 0.00005467645413703322 +armv7neon_mmm_f32_8x6_cortexa7 8 32 11 0.0000048195055146364755 +armv7neon_mmm_f32_8x4_cortexa7 8 128 5 0.000011083761554457595 +generic_f32_4x4 7 128 12 0.000022151037088381854 +generic_f32_4x4 9 128 11 0.00003295464355640065 +armv7neon_mmm_f32_8x6_cortexa9 25 4 17 0.000009038405830025483 +generic_f32_4x4 8 32 3 0.0000032862084964825317 +armv7neon_mmm_f32_8x4_cortexa9 23 128 12 0.00003536277613858418 +armv7neon_mmm_f32_8x6_cortexa9 16 128 11 0.000021144715462112054 +armv7neon_mmm_f32_8x6_generic 17 32 5 0.000005941189535355633 +armv7neon_mmm_f32_8x6_generic 15 32 13 0.000011297818131567483 +armv7neon_mmm_f32_8x6_generic 16 32 11 0.000007448947403712638 +generic_f32_4x4 5 4 7 0.0000029999149449467176 +armv7neon_mmm_f32_8x4_generic 8 32 13 0.000005690794573889263 +armv7neon_mmm_f32_8x4_cortexa9 23 128 4 0.000012046555463702128 +armv7neon_mmm_f32_8x4_cortexa7 25 128 5 0.00004305932269978789 +armv7neon_mmm_f32_8x6_cortexa7 24 32 7 0.000013181202441221999 +armv7neon_mmm_f32_8x4_cortexa9 23 4 3 0.0000026820954481134975 +generic_f32_4x4 12 4 8 0.0000036441773649361156 +armv7neon_mmm_f32_8x6_cortexa9 24 128 17 0.000046509213530800746 +armv7neon_mmm_f32_8x6_cortexa9 8 4 6 0.0000011575433314376175 +generic_f32_4x4 3 32 4 0.0000018882099872634265 +armv7neon_mmm_f32_8x4_cortexa7 9 32 3 0.000004070589988484743 +armv7neon_mmm_f32_8x4_cortexa7 17 32 9 0.000015851096904703647 +generic_f32_4x4 13 32 12 0.00001535208709935954 +armv7neon_mmm_f32_8x6_cortexa9 25 4 19 0.000011387388616753858 +armv7neon_mmm_f32_8x6_cortexa9 23 128 5 0.00001624764366012842 +armv7neon_mmm_f32_8x4_cortexa9 9 128 12 0.00002346244933254337 +armv7neon_mmm_f32_8x4_cortexa9 25 4 12 0.000007150024568853224 +armv7neon_mmm_f32_8x6_cortexa9 15 32 12 0.000007825900269054534 +armv7neon_mmm_f32_8x4_cortexa7 25 4 12 0.000007707673254391108 +generic_f32_4x4 11 32 5 0.000008363778534607299 
+armv7neon_mmm_f32_8x4_cortexa7 17 128 8 0.00003194246552397465 +armv7neon_mmm_f32_8x6_generic 8 32 6 0.0000021066884422501677 +armv7neon_mmm_f32_8x6_cortexa7 16 4 11 0.0000036716657035155366 +armv7neon_mmm_f32_8x4_generic 17 128 7 0.00002384609528431003 +armv7neon_mmm_f32_8x4_cortexa7 9 128 7 0.000021759641210777887 +armv7neon_mmm_f32_8x6_cortexa7 17 32 11 0.000013412538846571892 +generic_f32_4x4 7 32 8 0.000005690920861795519 +armv7neon_mmm_f32_8x6_generic 25 128 19 0.00008258096370056387 +armv7neon_mmm_f32_8x6_cortexa7 15 4 7 0.000003922803920269034 +armv7neon_mmm_f32_8x4_cortexa9 16 4 12 0.0000036509568763084446 +generic_f32_4x4 11 128 4 0.000011287445429820831 +armv7neon_mmm_f32_8x6_cortexa7 15 128 5 0.000014299033646853434 +armv7neon_mmm_f32_8x6_cortexa9 24 128 12 0.00003053734816197179 +armv7neon_mmm_f32_8x6_cortexa9 17 32 6 0.000005510675534687666 +armv7neon_mmm_f32_8x6_cortexa9 9 32 19 0.000014080838768872731 +generic_f32_4x4 13 128 5 0.000029375850896877852 +armv7neon_mmm_f32_8x6_cortexa7 25 32 5 0.000009454087084930695 +armv7neon_mmm_f32_8x4_generic 8 4 8 0.0000015055905623190838 +armv7neon_mmm_f32_8x4_cortexa7 16 4 13 0.00000545158088053187 +armv7neon_mmm_f32_8x6_generic 8 128 6 0.0000054873008191504395 +generic_f32_4x4 5 128 7 0.000015018624971335435 +armv7neon_mmm_f32_8x6_generic 15 128 7 0.00002123110926361701 +armv7neon_mmm_f32_8x4_cortexa7 7 32 3 0.000002283721924125051 +armv7neon_mmm_f32_8x6_generic 9 128 19 0.00004153518147749415 +armv7neon_mmm_f32_8x4_generic 9 32 11 0.000008626343403146056 +armv7neon_mmm_f32_8x6_cortexa9 9 4 6 0.0000018884740207207647 +armv7neon_mmm_f32_8x6_cortexa7 9 128 6 0.000013948466701679792 +generic_f32_4x4 11 4 12 0.000005708922503580876 +armv7neon_mmm_f32_8x6_cortexa7 24 128 6 0.000020527418679269756 +armv7neon_mmm_f32_8x4_cortexa7 8 4 11 0.0000024406051533290663 +armv7neon_mmm_f32_8x4_cortexa9 23 128 9 0.000035493958882244616 +armv7neon_mmm_f32_8x6_cortexa9 17 4 11 0.00000498803165144454 +armv7neon_mmm_f32_8x6_generic 
8 32 18 0.000005320856703844532 +armv7neon_mmm_f32_8x4_cortexa9 9 128 4 0.00000810350837903274 +generic_f32_4x4 9 4 8 0.000003892676050216947 +armv7neon_mmm_f32_8x6_cortexa9 24 128 5 0.000016255452753216072 +armv7neon_mmm_f32_8x6_generic 7 4 5 0.0000013389729556607902 +armv7neon_mmm_f32_8x6_generic 17 32 19 0.000020571678410473347 +armv7neon_mmm_f32_8x6_cortexa9 7 32 13 0.0000063039500398145465 +armv7neon_mmm_f32_8x4_cortexa9 17 128 7 0.000023826784148315835 +armv7neon_mmm_f32_8x4_generic 24 4 7 0.000004274685140650307 +armv7neon_mmm_f32_8x6_cortexa9 17 32 17 0.000015955807283560996 +armv7neon_mmm_f32_8x6_cortexa7 17 4 13 0.000007179865477053954 +armv7neon_mmm_f32_8x4_cortexa9 7 32 12 0.00000488792089076881 +generic_f32_4x4 4 128 11 0.000011283150201350866 +armv7neon_mmm_f32_8x6_cortexa7 7 128 17 0.00002152414507851334 +armv7neon_mmm_f32_8x4_generic 25 32 13 0.0000216921434601138 +armv7neon_mmm_f32_8x6_generic 15 32 12 0.000007763276203034865 +generic_f32_4x4 7 128 9 0.00002231642099570999 +armv7neon_mmm_f32_8x4_generic 23 4 7 0.0000044972515990527706 +armv7neon_mmm_f32_8x4_generic 16 128 9 0.00002352868039758383 +armv7neon_mmm_f32_8x4_generic 24 32 8 0.000008105856924464553 +armv7neon_mmm_f32_8x4_cortexa7 25 4 9 0.000008239024137829502 +armv7neon_mmm_f32_8x4_generic 8 4 7 0.000001726186960943308 +armv7neon_mmm_f32_8x6_cortexa7 24 4 12 0.000004476686880431161 +armv7neon_mmm_f32_8x4_generic 8 4 5 0.0000016761254405249265 +armv7neon_mmm_f32_8x6_cortexa9 17 32 18 0.000015401920673930987 +armv7neon_mmm_f32_8x6_cortexa7 8 4 18 0.000002497401986754502 +armv7neon_mmm_f32_8x6_cortexa7 23 32 18 0.000019976606065621178 +armv7neon_mmm_f32_8x4_cortexa9 16 128 5 0.000015920451629989893 +armv7neon_mmm_f32_8x6_cortexa7 16 128 19 0.00005438554195188942 +armv7neon_mmm_f32_8x4_generic 25 32 11 0.00001669591843012648 +armv7neon_mmm_f32_8x6_cortexa9 7 32 18 0.000006518795701369289 +generic_f32_4x4 3 128 7 0.000007891377185866899 +armv7neon_mmm_f32_8x6_cortexa9 9 128 11 
0.000021029476636249923 +armv7neon_mmm_f32_8x4_generic 9 32 8 0.00000573930624839856 +armv7neon_mmm_f32_8x4_cortexa9 23 32 9 0.000012851357918328979 +armv7neon_mmm_f32_8x4_cortexa7 25 128 11 0.00006422947640130408 +armv7neon_mmm_f32_8x4_cortexa7 9 128 5 0.000021690553836350107 +armv7neon_mmm_f32_8x4_cortexa7 8 128 4 0.000005680428664405247 +armv7neon_mmm_f32_8x6_cortexa9 17 32 11 0.000010949048608533127 +armv7neon_mmm_f32_8x4_cortexa9 9 32 13 0.00001119516860361631 +armv7neon_mmm_f32_8x4_cortexa9 8 128 11 0.000012017424644185006 +armv7neon_mmm_f32_8x4_cortexa9 17 4 9 0.000005874110633384251 +armv7neon_mmm_f32_8x6_cortexa7 9 4 7 0.0000035812085152963736 +armv7neon_mmm_f32_8x4_cortexa7 8 4 12 0.0000022112784369711703 +armv7neon_mmm_f32_8x6_generic 23 32 13 0.000016306177258992233 +armv7neon_mmm_f32_8x4_cortexa7 9 32 11 0.000010819099493176479 +armv7neon_mmm_f32_8x4_generic 25 128 12 0.00004665173717365099 +armv7neon_mmm_f32_8x4_cortexa9 25 4 8 0.000004903249015406008 +armv7neon_mmm_f32_8x4_generic 15 128 8 0.000016011765752869696 +generic_f32_4x4 4 4 5 0.00000173237663586998 +generic_f32_4x4 7 32 13 0.000010951743398686358 +armv7neon_mmm_f32_8x6_cortexa9 7 32 11 0.000004373180249911381 +armv7neon_mmm_f32_8x6_generic 9 4 7 0.000003340351034825373 +armv7neon_mmm_f32_8x6_cortexa7 8 128 6 0.000007164475272224828 +generic_f32_4x4 8 4 11 0.000003982256860993786 +armv7neon_mmm_f32_8x6_generic 15 128 5 0.000010930509860719574 +armv7neon_mmm_f32_8x4_cortexa9 25 128 11 0.00004712675673590545 +armv7neon_mmm_f32_8x4_cortexa7 7 4 11 0.000002822400310815202 +armv7neon_mmm_f32_8x6_cortexa7 17 32 13 0.000019470812344800868 +armv7neon_mmm_f32_8x4_cortexa7 23 32 4 0.000005619794374416781 +generic_f32_4x4 11 32 8 0.000008140965167056766 +armv7neon_mmm_f32_8x4_cortexa7 17 4 5 0.000004516405012537028 +armv7neon_mmm_f32_8x4_cortexa7 17 128 4 0.00001625158956213561 +armv7neon_mmm_f32_8x6_generic 16 128 18 0.000030484389052945016 +armv7neon_mmm_f32_8x6_cortexa7 17 128 11 
0.00004127386103261296 +armv7neon_mmm_f32_8x6_cortexa7 24 32 6 0.000006586776362035912 +armv7neon_mmm_f32_8x4_generic 9 32 5 0.000005900796369564617 +armv7neon_mmm_f32_8x6_cortexa7 7 128 11 0.000014441097326114378 +armv7neon_mmm_f32_8x6_cortexa9 24 4 19 0.000008421665590898334 +armv7neon_mmm_f32_8x4_cortexa7 15 32 9 0.000011049029459002566 +armv7neon_mmm_f32_8x6_cortexa9 17 32 7 0.000010792251610714281 +armv7neon_mmm_f32_8x4_generic 24 32 11 0.000012578955321334167 +armv7neon_mmm_f32_8x6_generic 8 4 17 0.0000025848174603713075 +armv7neon_mmm_f32_8x4_cortexa9 8 128 12 0.000011812308047768464 +generic_f32_4x4 12 128 8 0.00002174331661540877 +armv7neon_mmm_f32_8x4_cortexa7 16 128 12 0.0000317241936699183 +armv7neon_mmm_f32_8x6_cortexa9 9 4 5 0.000002126610313888205 +armv7neon_mmm_f32_8x6_cortexa9 8 32 12 0.0000037644801015301184 +armv7neon_mmm_f32_8x4_cortexa9 9 32 4 0.000003081152195958602 +generic_f32_4x4 7 32 9 0.000008408374041494756 +generic_f32_4x4 4 128 4 0.0000040497405415986515 +armv7neon_mmm_f32_8x4_cortexa9 15 128 7 0.000016202393035594662 +armv7neon_mmm_f32_8x6_cortexa9 25 128 6 0.000020788151245438416 +armv7neon_mmm_f32_8x4_cortexa7 8 128 13 0.000021488179929071295 +armv7neon_mmm_f32_8x4_cortexa7 25 32 9 0.00002091079321932589 +armv7neon_mmm_f32_8x4_cortexa9 8 128 5 0.000008180420598290364 +armv7neon_mmm_f32_8x4_generic 7 128 3 0.000004423621357206741 +armv7neon_mmm_f32_8x4_cortexa9 15 4 9 0.000004394233374561228 +armv7neon_mmm_f32_8x6_generic 7 4 12 0.0000025127271377138368 +armv7neon_mmm_f32_8x4_generic 23 128 8 0.00002364193719899604 +armv7neon_mmm_f32_8x4_cortexa7 7 32 11 0.000005969501269507202 +armv7neon_mmm_f32_8x6_cortexa9 16 4 7 0.0000033362358276857998 +armv7neon_mmm_f32_8x4_generic 25 4 4 0.000002679652210124942 +armv7neon_mmm_f32_8x6_cortexa7 24 128 19 0.00008148701915820504 +armv7neon_mmm_f32_8x4_cortexa7 25 32 12 0.000020301609592643522 +armv7neon_mmm_f32_8x6_generic 25 4 12 0.000005597867792560783 +armv7neon_mmm_f32_8x4_cortexa9 16 4 13 
0.0000050370172264265826 +generic_f32_4x4 11 4 8 0.000003987191713644613 +armv7neon_mmm_f32_8x4_cortexa7 7 32 9 0.00000592156011559423 +armv7neon_mmm_f32_8x6_cortexa7 25 4 13 0.000009317851927845776 +armv7neon_mmm_f32_8x6_generic 7 32 6 0.0000025009886657895365 +armv7neon_mmm_f32_8x4_cortexa9 9 32 5 0.0000059031902576708605 +armv7neon_mmm_f32_8x4_cortexa7 25 4 7 0.000005997625417148922 +armv7neon_mmm_f32_8x4_generic 15 128 9 0.000023965770004599912 +armv7neon_mmm_f32_8x4_generic 9 128 3 0.000008336141118552254 +armv7neon_mmm_f32_8x6_cortexa9 15 4 5 0.0000022485015750854945 +armv7neon_mmm_f32_8x4_cortexa7 25 32 13 0.000027508557090202434 +armv7neon_mmm_f32_8x4_cortexa9 7 32 4 0.0000019276370605991633 +armv7neon_mmm_f32_8x6_cortexa7 7 32 13 0.000007512654299558133 +generic_f32_4x4 12 128 12 0.000032295866967880904 +armv7neon_mmm_f32_8x6_generic 8 128 13 0.00001569076394434193 +armv7neon_mmm_f32_8x4_cortexa9 17 128 8 0.000023412675748873913 +armv7neon_mmm_f32_8x4_cortexa9 16 4 3 0.000001964056838467611 +armv7neon_mmm_f32_8x4_generic 16 4 4 0.0000015347326629006993 +armv7neon_mmm_f32_8x4_cortexa9 24 4 4 0.0000020572993016505205 +armv7neon_mmm_f32_8x4_cortexa9 8 32 3 0.0000019507567628415824 +armv7neon_mmm_f32_8x4_cortexa7 8 32 7 0.000004003993118643105 +armv7neon_mmm_f32_8x4_generic 17 4 13 0.000007623324255157955 +armv7neon_mmm_f32_8x6_cortexa9 9 4 11 0.0000035090633204775533 +armv7neon_mmm_f32_8x6_generic 9 4 19 0.00000601244922696809 +armv7neon_mmm_f32_8x6_generic 9 128 7 0.000020875839952630357 +generic_f32_4x4 4 32 11 0.000004326527977488747 +armv7neon_mmm_f32_8x6_cortexa9 25 32 18 0.00002022815888923194 +armv7neon_mmm_f32_8x6_cortexa9 15 4 18 0.000005434062942764287 +armv7neon_mmm_f32_8x4_cortexa9 24 4 3 0.000002704298016252254 +armv7neon_mmm_f32_8x4_generic 24 4 9 0.000005701271915969865 +armv7neon_mmm_f32_8x6_cortexa9 7 128 19 0.000021850407441008384 +armv7neon_mmm_f32_8x4_cortexa9 15 4 3 0.0000019251432800014627 +armv7neon_mmm_f32_8x6_cortexa7 15 4 18 
0.000005711033859652051 +armv7neon_mmm_f32_8x6_cortexa7 9 4 17 0.000005102785803942399 +armv7neon_mmm_f32_8x4_generic 8 128 4 0.000004232482987549164 +armv7neon_mmm_f32_8x4_cortexa9 25 128 7 0.00003172200454891407 +armv7neon_mmm_f32_8x4_cortexa9 9 32 12 0.000008361571818242635 +armv7neon_mmm_f32_8x6_cortexa7 24 32 17 0.000019514235657015186 +armv7neon_mmm_f32_8x6_cortexa7 23 128 6 0.000020944404830412158 +armv7neon_mmm_f32_8x6_cortexa9 17 128 5 0.000016134142374025674 +armv7neon_mmm_f32_8x4_cortexa7 8 4 7 0.000001873057019803265 +armv7neon_mmm_f32_8x6_cortexa7 25 128 5 0.00002802974521426182 +armv7neon_mmm_f32_8x6_cortexa9 17 4 17 0.000006958050831218578 +armv7neon_mmm_f32_8x4_cortexa9 15 32 5 0.00000608182468505122 +armv7neon_mmm_f32_8x4_generic 7 128 4 0.0000044404498411270055 +generic_f32_4x4 13 4 5 0.000005350599856702572 +armv7neon_mmm_f32_8x4_generic 16 4 9 0.0000039608523692233575 +armv7neon_mmm_f32_8x6_cortexa9 16 4 12 0.000003013935297522993 +armv7neon_mmm_f32_8x6_generic 15 4 13 0.000005319052004340107 +armv7neon_mmm_f32_8x4_cortexa9 8 4 13 0.0000027408020700732427 +armv7neon_mmm_f32_8x6_generic 8 128 18 0.000015505866042830093 +armv7neon_mmm_f32_8x6_cortexa7 15 32 18 0.000013888471960481304 +armv7neon_mmm_f32_8x4_cortexa7 15 128 8 0.00002174159034943732 +armv7neon_mmm_f32_8x4_generic 9 32 4 0.0000030856158641387716 +armv7neon_mmm_f32_8x4_generic 24 4 5 0.000004129514406774627 +generic_f32_4x4 4 4 7 0.0000017497295508923947 +armv7neon_mmm_f32_8x6_generic 23 32 17 0.00001653018733416135 +generic_f32_4x4 7 128 13 0.000029515913541067617 +armv7neon_mmm_f32_8x6_cortexa9 24 32 19 0.000020360891894208073 +armv7neon_mmm_f32_8x6_cortexa9 9 32 18 0.000010557768641496435 +armv7neon_mmm_f32_8x4_generic 8 4 13 0.000002736501222546797 +armv7neon_mmm_f32_8x4_generic 16 32 11 0.000008543993772071838 +armv7neon_mmm_f32_8x4_cortexa7 7 32 12 0.0000059782618274442185 +armv7neon_mmm_f32_8x6_cortexa7 8 128 11 0.000014095624176931188 +armv7neon_mmm_f32_8x4_cortexa7 7 32 13 
0.00000775363354362587 +armv7neon_mmm_f32_8x4_cortexa7 24 128 5 0.000032299734402349505 +armv7neon_mmm_f32_8x6_cortexa9 25 32 12 0.000013697867410484751 +armv7neon_mmm_f32_8x4_cortexa7 24 128 4 0.000016113162651910352 +armv7neon_mmm_f32_8x6_cortexa9 9 32 5 0.0000041006560791439575 +generic_f32_4x4 7 4 11 0.000004296735480110931 +armv7neon_mmm_f32_8x6_generic 8 32 11 0.000003977003783384346 +armv7neon_mmm_f32_8x4_generic 24 32 13 0.000016227668114322495 +armv7neon_mmm_f32_8x6_generic 7 32 5 0.0000023419854212729095 +generic_f32_4x4 3 128 3 0.00000421323412258659 +generic_f32_4x4 8 4 12 0.0000036950676170218278 +armv7neon_mmm_f32_8x4_cortexa7 15 4 12 0.000004609879642718733 +armv7neon_mmm_f32_8x6_generic 8 128 11 0.000010745832002669769 +armv7neon_mmm_f32_8x6_cortexa7 16 32 18 0.000012726640475291294 +armv7neon_mmm_f32_8x6_generic 24 4 13 0.000006477448734654056 +armv7neon_mmm_f32_8x4_cortexa9 23 128 5 0.000023893090361578605 +armv7neon_mmm_f32_8x6_cortexa9 7 128 7 0.000011059489910827611 +armv7neon_mmm_f32_8x4_cortexa7 24 4 13 0.000007916232793484605 +armv7neon_mmm_f32_8x6_generic 24 32 5 0.000006050244864860637 +armv7neon_mmm_f32_8x6_generic 23 4 5 0.000003050740283377829 +armv7neon_mmm_f32_8x4_generic 17 4 7 0.000004317179186717401 +generic_f32_4x4 13 128 3 0.000015316541870867625 +armv7neon_mmm_f32_8x6_generic 25 128 11 0.00004196449161824797 +armv7neon_mmm_f32_8x6_generic 24 32 7 0.000010654786791684403 +armv7neon_mmm_f32_8x6_generic 9 128 13 0.0000310078542141026 +armv7neon_mmm_f32_8x4_cortexa7 17 128 13 0.00006395441906545403 +armv7neon_mmm_f32_8x6_cortexa7 25 4 11 0.000006802751222800881 +armv7neon_mmm_f32_8x6_cortexa7 16 32 12 0.000008630037205832748 +armv7neon_mmm_f32_8x6_cortexa9 8 128 6 0.000005530202507023102 +armv7neon_mmm_f32_8x4_cortexa7 24 4 3 0.000002895093469687887 +armv7neon_mmm_f32_8x6_cortexa7 23 32 13 0.00002011431966689971 +armv7neon_mmm_f32_8x6_cortexa7 8 128 17 0.000020781906415830542 +armv7neon_mmm_f32_8x6_cortexa7 8 128 7 
0.00001401297102177983 +generic_f32_4x4 12 128 4 0.000011113783764067 +armv7neon_mmm_f32_8x4_cortexa9 23 128 8 0.000023633711148043694 +armv7neon_mmm_f32_8x6_cortexa7 8 32 7 0.00000473866242690848 +armv7neon_mmm_f32_8x6_cortexa7 23 128 13 0.00006200535049902772 +armv7neon_mmm_f32_8x6_generic 25 128 12 0.000040933940992834904 +armv7neon_mmm_f32_8x6_generic 8 4 7 0.0000019018312390917206 +armv7neon_mmm_f32_8x6_cortexa9 9 128 13 0.0000311113672962118 +armv7neon_mmm_f32_8x6_cortexa7 17 32 6 0.000006712613519894369 +armv7neon_mmm_f32_8x6_generic 17 32 17 0.00001584305104507533 +armv7neon_mmm_f32_8x6_cortexa9 25 4 5 0.000003846834546374256 +armv7neon_mmm_f32_8x4_generic 23 128 11 0.00003563405640596688 +armv7neon_mmm_f32_8x6_cortexa7 7 32 18 0.000007762095957340233 +armv7neon_mmm_f32_8x4_cortexa7 8 4 3 0.000001278724116798178 +armv7neon_mmm_f32_8x4_generic 8 32 9 0.000004428552791795867 +armv7neon_mmm_f32_8x4_cortexa7 9 32 8 0.000007212913220352459 +armv7neon_mmm_f32_8x6_cortexa7 16 32 11 0.000009132197785046325 +armv7neon_mmm_f32_8x6_generic 15 4 6 0.0000021293993921351435 +armv7neon_mmm_f32_8x6_cortexa9 8 32 17 0.000005618722861518267 +generic_f32_4x4 12 32 8 0.000007824605805291677 +generic_f32_4x4 3 4 9 0.0000025426825999668207 +armv7neon_mmm_f32_8x6_cortexa9 15 128 6 0.000010938928121944877 +armv7neon_mmm_f32_8x6_generic 8 128 12 0.000010479271785006985 +armv7neon_mmm_f32_8x6_cortexa7 25 4 18 0.000008843120394707296 +generic_f32_4x4 5 128 8 0.000014867005692635484 +armv7neon_mmm_f32_8x4_generic 25 4 12 0.000007134980758480078 +armv7neon_mmm_f32_8x6_generic 23 4 6 0.0000027550692223444384 +generic_f32_4x4 8 128 4 0.000007594935908642548 +armv7neon_mmm_f32_8x4_generic 24 4 3 0.0000027070026756528944 +armv7neon_mmm_f32_8x6_generic 15 4 7 0.0000036879615500804916 +armv7neon_mmm_f32_8x6_cortexa7 8 4 13 0.0000026670243954904784 +armv7neon_mmm_f32_8x6_generic 17 4 7 0.000004739940833422853 +armv7neon_mmm_f32_8x6_generic 24 128 11 0.00003139196614178293 
+armv7neon_mmm_f32_8x6_cortexa9 24 128 13 0.0000461884138564535 +generic_f32_4x4 9 128 9 0.00003291260190607621 +armv7neon_mmm_f32_8x4_cortexa9 7 128 11 0.000012420788787006248 +armv7neon_mmm_f32_8x6_generic 17 128 12 0.00003088346853188957 +armv7neon_mmm_f32_8x4_cortexa7 8 4 13 0.000002968959195086507 +armv7neon_mmm_f32_8x6_cortexa9 24 128 6 0.000015548482246921477 +armv7neon_mmm_f32_8x4_cortexa7 17 32 7 0.000010943141781236415 +armv7neon_mmm_f32_8x6_cortexa7 16 4 12 0.000003160151719580885 +generic_f32_4x4 5 128 5 0.000014996303737029157 +armv7neon_mmm_f32_8x6_generic 25 4 6 0.0000030664133228659144 +armv7neon_mmm_f32_8x4_cortexa9 9 4 11 0.0000041810312425253365 +armv7neon_mmm_f32_8x6_generic 15 4 18 0.000005353071509503547 +armv7neon_mmm_f32_8x6_cortexa7 8 32 12 0.000004562116220290897 +armv7neon_mmm_f32_8x4_cortexa9 8 32 7 0.0000032056568363054847 +armv7neon_mmm_f32_8x4_generic 16 4 11 0.0000040609047361039316 +armv7neon_mmm_f32_8x6_cortexa9 17 4 19 0.00000876633957316096 +armv7neon_mmm_f32_8x6_cortexa7 15 128 18 0.00004180200146350024 +generic_f32_4x4 5 128 9 0.000022192516532888443 +armv7neon_mmm_f32_8x6_cortexa9 24 32 11 0.000010954278251461477 +armv7neon_mmm_f32_8x6_cortexa9 8 32 5 0.000002372663230804753 +armv7neon_mmm_f32_8x6_generic 8 4 5 0.0000013605175071348517 +armv7neon_mmm_f32_8x6_cortexa7 24 4 6 0.0000025054020094278158 +armv7neon_mmm_f32_8x6_generic 24 128 5 0.000016222838915570737 +generic_f32_4x4 12 32 4 0.000004159176762509275 +generic_f32_4x4 13 4 11 0.000007582485226342242 +generic_f32_4x4 9 4 5 0.000004196638831417376 +armv7neon_mmm_f32_8x6_cortexa7 25 32 17 0.000026071317528366354 +armv7neon_mmm_f32_8x4_cortexa9 16 128 8 0.000015686411664235285 +generic_f32_4x4 13 128 11 0.00004380249651194818 +armv7neon_mmm_f32_8x4_cortexa7 24 32 13 0.000020563104321953744 +armv7neon_mmm_f32_8x4_cortexa9 23 4 7 0.0000044861575535086865 +generic_f32_4x4 12 128 3 0.000011613772328414718 +armv7neon_mmm_f32_8x6_cortexa7 16 4 19 0.000006163473976132481 
+generic_f32_4x4 7 4 8 0.0000029313315346635237 +armv7neon_mmm_f32_8x4_cortexa9 23 32 12 0.000012545342985454958 +armv7neon_mmm_f32_8x6_cortexa9 7 128 11 0.000011136453503507945 +armv7neon_mmm_f32_8x6_cortexa7 9 128 13 0.000041110907972239005 +armv7neon_mmm_f32_8x4_cortexa9 7 32 8 0.000003418961974304266 +armv7neon_mmm_f32_8x4_cortexa7 24 32 12 0.000015105629538772432 +generic_f32_4x4 7 128 3 0.000007903637217361153 +armv7neon_mmm_f32_8x6_cortexa7 9 4 11 0.000003661168986658601 +generic_f32_4x4 4 128 9 0.00001126621217728165 +armv7neon_mmm_f32_8x4_cortexa7 23 128 3 0.000016760586455833876 +armv7neon_mmm_f32_8x6_cortexa9 16 32 12 0.000007056930252112048 +armv7neon_mmm_f32_8x6_cortexa9 8 4 19 0.000003165350777332514 +armv7neon_mmm_f32_8x6_generic 8 32 13 0.000005498586776368978 +generic_f32_4x4 3 128 13 0.000015227075547579779 +generic_f32_4x4 12 4 13 0.000007217490541690208 +armv7neon_mmm_f32_8x6_cortexa7 16 128 18 0.000040584526180649477 +armv7neon_mmm_f32_8x6_cortexa9 25 4 7 0.000006216705461889991 +armv7neon_mmm_f32_8x4_generic 24 128 9 0.000035021164515684145 +generic_f32_4x4 3 32 12 0.000004609887398147942 +armv7neon_mmm_f32_8x4_generic 25 128 9 0.00004684932933132534 +armv7neon_mmm_f32_8x6_generic 7 4 17 0.0000033628632970040393 +generic_f32_4x4 4 128 12 0.000011148897561634414 +armv7neon_mmm_f32_8x4_generic 8 128 8 0.000008028324048468155 +armv7neon_mmm_f32_8x4_generic 8 4 11 0.0000022515619534309057 +armv7neon_mmm_f32_8x4_cortexa7 9 32 5 0.00000739044702644143 +armv7neon_mmm_f32_8x4_generic 8 128 5 0.000008179707225297848 +armv7neon_mmm_f32_8x4_generic 9 4 4 0.0000016079691037580586 +armv7neon_mmm_f32_8x4_generic 16 128 4 0.00000803169255760427 +armv7neon_mmm_f32_8x4_generic 25 128 3 0.000016355748669331108 +armv7neon_mmm_f32_8x6_cortexa9 24 4 18 0.0000061179448610506165 +armv7neon_mmm_f32_8x6_cortexa9 16 4 19 0.000005803928147585998 +armv7neon_mmm_f32_8x6_cortexa7 17 128 6 0.00002064753338392539 +armv7neon_mmm_f32_8x4_cortexa9 9 4 12 0.000003936841431536775 
+armv7neon_mmm_f32_8x4_cortexa7 17 32 13 0.000020815595163068456 +armv7neon_mmm_f32_8x6_cortexa7 23 32 11 0.000013810001606181058 +armv7neon_mmm_f32_8x4_cortexa7 15 32 13 0.00001449569792912252 +armv7neon_mmm_f32_8x4_cortexa7 7 4 3 0.0000012380899483592991 +armv7neon_mmm_f32_8x6_cortexa9 23 128 17 0.00004728149304262184 +armv7neon_mmm_f32_8x4_cortexa7 15 32 3 0.000004158381257275162 +armv7neon_mmm_f32_8x4_cortexa7 9 4 9 0.000004430434184731522 +armv7neon_mmm_f32_8x6_generic 23 128 13 0.00004715382202102794 +armv7neon_mmm_f32_8x6_cortexa9 9 4 19 0.000006116079821760871 +armv7neon_mmm_f32_8x6_cortexa9 7 4 18 0.0000035442600221807646 +armv7neon_mmm_f32_8x4_generic 15 128 5 0.000016261604433183137 +armv7neon_mmm_f32_8x4_cortexa9 8 128 4 0.000004251382345566181 +armv7neon_mmm_f32_8x6_cortexa7 25 128 11 0.000055294824394928404 +armv7neon_mmm_f32_8x6_cortexa9 23 128 13 0.00004716175024845285 +generic_f32_4x4 12 128 11 0.00003283409583810123 +armv7neon_mmm_f32_8x6_cortexa7 15 128 6 0.000014244472752339967 +armv7neon_mmm_f32_8x4_generic 17 32 8 0.000008307634701683482 +armv7neon_mmm_f32_8x4_generic 16 4 13 0.000005034888545956025 +armv7neon_mmm_f32_8x6_cortexa7 9 128 11 0.000027682771225447373 +armv7neon_mmm_f32_8x4_generic 25 128 8 0.000030970064243633 +armv7neon_mmm_f32_8x6_cortexa9 15 32 5 0.0000042126470351483745 +armv7neon_mmm_f32_8x6_cortexa9 8 128 7 0.000010688502492077545 +armv7neon_mmm_f32_8x6_cortexa7 8 32 19 0.000008785622923040567 +generic_f32_4x4 8 32 7 0.000005699367356537332 +armv7neon_mmm_f32_8x4_cortexa9 23 4 13 0.000007982321009915726 +armv7neon_mmm_f32_8x6_cortexa9 7 32 19 0.000008296837727225683 +armv7neon_mmm_f32_8x4_cortexa9 17 128 12 0.00003482723156416195 +armv7neon_mmm_f32_8x4_cortexa7 16 4 12 0.000003962627391393735 +armv7neon_mmm_f32_8x4_generic 24 32 4 0.000004275975584058462 +armv7neon_mmm_f32_8x4_generic 15 4 4 0.00000172768002087017 +generic_f32_4x4 12 128 13 0.000043289852359027704 +armv7neon_mmm_f32_8x6_cortexa7 24 32 18 
0.00001876149316286456 +armv7neon_mmm_f32_8x4_cortexa9 23 128 13 0.000047081790611742305 +armv7neon_mmm_f32_8x4_generic 25 4 8 0.000004894765654717258 +generic_f32_4x4 13 32 9 0.000015809325344243507 +armv7neon_mmm_f32_8x6_cortexa9 15 32 11 0.000007869115330631703 +armv7neon_mmm_f32_8x6_cortexa7 16 32 19 0.000017064208112188813 +armv7neon_mmm_f32_8x4_cortexa7 9 32 12 0.000010571719001405112 +armv7neon_mmm_f32_8x6_cortexa9 7 4 12 0.000002545629385927187 +armv7neon_mmm_f32_8x4_cortexa9 24 128 5 0.00002368492009732677 +armv7neon_mmm_f32_8x6_cortexa7 25 4 5 0.000004026698373090156 +generic_f32_4x4 9 32 9 0.000012044684258478282 +armv7neon_mmm_f32_8x6_cortexa9 25 128 12 0.000040909803644855135 +generic_f32_4x4 8 128 12 0.00002170370665519434 +armv7neon_mmm_f32_8x6_generic 23 4 13 0.000007322110702891982 +armv7neon_mmm_f32_8x4_generic 8 128 9 0.000011964404434625917 +armv7neon_mmm_f32_8x6_generic 24 4 6 0.00000234486203657902 +armv7neon_mmm_f32_8x4_generic 16 128 12 0.000023289775015844785 +armv7neon_mmm_f32_8x4_generic 25 32 4 0.00000566239966628999 +armv7neon_mmm_f32_8x4_cortexa7 8 128 7 0.000011133117389377919 +armv7neon_mmm_f32_8x4_cortexa7 23 128 7 0.000032614653457132354 +generic_f32_4x4 11 128 3 0.000011596634661777904 +armv7neon_mmm_f32_8x6_generic 9 32 7 0.000007301874181587425 +armv7neon_mmm_f32_8x6_cortexa7 25 4 6 0.0000032842534990270153 +armv7neon_mmm_f32_8x4_cortexa7 25 4 11 0.000008398472143479583 +armv7neon_mmm_f32_8x4_cortexa9 17 128 3 0.000012367498051511246 +armv7neon_mmm_f32_8x4_cortexa9 15 4 4 0.0000017226474127640693 +armv7neon_mmm_f32_8x6_cortexa9 8 4 12 0.0000017663174571893155 +armv7neon_mmm_f32_8x6_generic 24 32 18 0.000014939528824089691 +armv7neon_mmm_f32_8x4_cortexa9 15 128 3 0.000008424928636646119 +armv7neon_mmm_f32_8x4_cortexa9 25 32 8 0.000010844406621465396 +armv7neon_mmm_f32_8x6_generic 24 128 6 0.00001549274717546449 +armv7neon_mmm_f32_8x4_cortexa9 7 4 8 0.0000019411905486089406 +generic_f32_4x4 5 4 12 0.000003970968097394295 
+armv7neon_mmm_f32_8x4_cortexa7 17 128 3 0.000016675513856755485 +armv7neon_mmm_f32_8x6_generic 15 128 17 0.0000319988391989146 +armv7neon_mmm_f32_8x4_generic 8 4 3 0.0000012046114173177262 +armv7neon_mmm_f32_8x4_generic 9 4 13 0.000005271381310779438 +armv7neon_mmm_f32_8x6_generic 25 4 5 0.000003787680148419445 +armv7neon_mmm_f32_8x6_cortexa9 16 4 13 0.00000455965655028581 +armv7neon_mmm_f32_8x6_generic 16 128 12 0.000020507038874403576 +armv7neon_mmm_f32_8x4_cortexa9 7 4 3 0.0000011789464739113475 +armv7neon_mmm_f32_8x4_cortexa7 15 4 9 0.000004718086287004601 +generic_f32_4x4 3 32 8 0.0000032590697277049294 +armv7neon_mmm_f32_8x4_cortexa7 25 32 11 0.000021029318586240916 +armv7neon_mmm_f32_8x4_cortexa9 7 128 7 0.000008425064090676705 +armv7neon_mmm_f32_8x4_cortexa9 23 128 7 0.000024027014834706714 +armv7neon_mmm_f32_8x4_cortexa9 9 128 13 0.000031337531340777375 +generic_f32_4x4 12 32 3 0.0000046491920750409604 +armv7neon_mmm_f32_8x4_cortexa7 9 128 12 0.00003204704023977606 +armv7neon_mmm_f32_8x6_cortexa9 15 128 5 0.000010976146066847136 +armv7neon_mmm_f32_8x6_generic 9 128 5 0.000010821808595127155 +armv7neon_mmm_f32_8x6_cortexa9 16 32 18 0.000010217778748183444 +armv7neon_mmm_f32_8x6_cortexa9 8 32 13 0.000005537991894862071 +armv7neon_mmm_f32_8x6_cortexa7 15 128 11 0.000028075729994494098 +armv7neon_mmm_f32_8x6_cortexa7 24 4 7 0.000004992618899850428 +armv7neon_mmm_f32_8x6_generic 15 32 6 0.000004128781340215273 +armv7neon_mmm_f32_8x6_generic 15 32 18 0.000011342396294923752 +generic_f32_4x4 3 128 9 0.000011614205849736826 +armv7neon_mmm_f32_8x4_generic 7 32 9 0.000004862452767728157 +armv7neon_mmm_f32_8x4_cortexa9 17 4 7 0.000004308345799806778 +armv7neon_mmm_f32_8x6_cortexa9 16 32 13 0.000010598291750058129 +armv7neon_mmm_f32_8x4_generic 17 32 7 0.000008775628241420315 +armv7neon_mmm_f32_8x4_cortexa9 8 4 8 0.00000151354666080862 +armv7neon_mmm_f32_8x6_cortexa7 9 4 19 0.0000064774482704190535 +armv7neon_mmm_f32_8x4_cortexa9 9 4 4 0.000001610951102254284 
+armv7neon_mmm_f32_8x4_cortexa7 7 128 3 0.00000588268473785512 +armv7neon_mmm_f32_8x4_cortexa9 9 128 7 0.00001614593088465898 +armv7neon_mmm_f32_8x6_cortexa9 9 128 6 0.000010694366815766805 +armv7neon_mmm_f32_8x6_cortexa9 17 4 5 0.000003000192050393217 +armv7neon_mmm_f32_8x4_cortexa9 8 32 4 0.0000017239720455885444 +armv7neon_mmm_f32_8x6_cortexa7 25 128 18 0.0000814675708223554 +generic_f32_4x4 12 128 7 0.000022191318720751985 +generic_f32_4x4 5 128 13 0.000029341360540777852 +generic_f32_4x4 7 4 5 0.000003063610185404798 +armv7neon_mmm_f32_8x4_cortexa7 23 32 12 0.000015790133040211727 +armv7neon_mmm_f32_8x6_cortexa7 17 128 13 0.00006139081303315256 +armv7neon_mmm_f32_8x4_cortexa9 8 32 12 0.000004268120941798172 +armv7neon_mmm_f32_8x6_generic 16 32 17 0.000010656142728555816 +generic_f32_4x4 13 4 8 0.000004925203028119758 +armv7neon_mmm_f32_8x6_generic 16 4 6 0.0000017414671485444925 +armv7neon_mmm_f32_8x6_generic 23 4 7 0.000005090461887505994 +armv7neon_mmm_f32_8x4_cortexa9 7 128 4 0.000004439163092495918 +armv7neon_mmm_f32_8x4_cortexa9 15 32 4 0.000003199516110418427 +generic_f32_4x4 13 4 9 0.000007521941079552681 +armv7neon_mmm_f32_8x4_generic 9 128 13 0.00003132672472959027 +armv7neon_mmm_f32_8x6_cortexa7 9 128 5 0.000014186965083572584 +armv7neon_mmm_f32_8x4_cortexa9 16 32 13 0.000010950995485777078 +armv7neon_mmm_f32_8x4_cortexa7 24 4 11 0.000006343455120468695 +armv7neon_mmm_f32_8x4_generic 23 32 12 0.000012518582750578088 +armv7neon_mmm_f32_8x4_cortexa9 23 4 5 0.000004357470789736493 +armv7neon_mmm_f32_8x6_generic 9 128 12 0.000020738827544116163 +armv7neon_mmm_f32_8x4_generic 7 128 8 0.000008441257822863349 +armv7neon_mmm_f32_8x6_generic 25 4 18 0.000008206815180054958 +armv7neon_mmm_f32_8x6_cortexa7 16 128 7 0.00002754330318348781 +armv7neon_mmm_f32_8x4_cortexa9 15 128 12 0.000023818850811398474 +armv7neon_mmm_f32_8x6_cortexa7 7 32 5 0.0000027477786822716826 +generic_f32_4x4 4 32 9 0.000004310962413140207 +armv7neon_mmm_f32_8x6_cortexa9 9 32 11 
0.000007472405716714093 +generic_f32_4x4 9 4 7 0.0000042110601811121985 +armv7neon_mmm_f32_8x4_cortexa9 25 128 5 0.00003145000193606305 +armv7neon_mmm_f32_8x6_generic 16 128 7 0.000020961637268522957 +armv7neon_mmm_f32_8x6_cortexa7 8 128 18 0.00002061441681055432 +armv7neon_mmm_f32_8x6_cortexa7 15 32 11 0.000009542090957976056 +armv7neon_mmm_f32_8x4_cortexa9 8 32 8 0.0000030204030341535007 +armv7neon_mmm_f32_8x6_cortexa9 16 4 18 0.000004289256443600587 +armv7neon_mmm_f32_8x4_cortexa7 15 32 8 0.000007451057144596197 +armv7neon_mmm_f32_8x6_cortexa7 25 128 12 0.00005431857023175915 +armv7neon_mmm_f32_8x6_generic 7 128 11 0.000011099973931488493 +armv7neon_mmm_f32_8x4_cortexa9 16 128 11 0.000023648803168223167 +armv7neon_mmm_f32_8x6_cortexa9 17 4 12 0.0000044808063412694585 +armv7neon_mmm_f32_8x6_generic 25 32 7 0.000014053446208239492 +armv7neon_mmm_f32_8x6_generic 16 128 19 0.0000411706712809725 +armv7neon_mmm_f32_8x4_generic 16 32 13 0.000010937227005963747 +armv7neon_mmm_f32_8x4_cortexa7 8 32 3 0.000002329652194595725 +armv7neon_mmm_f32_8x4_cortexa9 23 4 11 0.000006308879061820964 +armv7neon_mmm_f32_8x6_cortexa7 16 4 7 0.0000035065261101064537 +armv7neon_mmm_f32_8x6_generic 9 128 6 0.000010599309996894829 +armv7neon_mmm_f32_8x4_cortexa7 24 128 11 0.000048035927729516165 +armv7neon_mmm_f32_8x6_generic 7 32 11 0.000004333152377602563 +armv7neon_mmm_f32_8x4_cortexa7 15 4 11 0.000004824968358151188 +armv7neon_mmm_f32_8x6_cortexa7 7 32 6 0.0000029036965940094368 +generic_f32_4x4 12 4 3 0.0000026010633583656195 +armv7neon_mmm_f32_8x6_cortexa9 8 32 18 0.000005373878070970815 +armv7neon_mmm_f32_8x4_generic 16 128 7 0.000016057054409709098 +armv7neon_mmm_f32_8x4_generic 17 128 5 0.000023713194238003484 +armv7neon_mmm_f32_8x6_generic 24 128 19 0.0000639824281045487 +generic_f32_4x4 5 4 13 0.000005260715034760787 +armv7neon_mmm_f32_8x6_cortexa9 16 128 19 0.000040932625772708764 +armv7neon_mmm_f32_8x4_generic 16 32 5 0.00000587187854127478 +armv7neon_mmm_f32_8x4_generic 15 32 
4 0.0000032031359648686487 +armv7neon_mmm_f32_8x4_cortexa9 24 32 13 0.00001619115724351509 +armv7neon_mmm_f32_8x4_generic 15 32 8 0.000005976862649343733 +armv7neon_mmm_f32_8x6_cortexa9 16 32 7 0.000007329062657266122 +armv7neon_mmm_f32_8x4_cortexa9 25 32 7 0.00001147457512424339 +armv7neon_mmm_f32_8x4_cortexa9 24 128 3 0.000012480179872163079 +armv7neon_mmm_f32_8x6_generic 16 4 13 0.000004503173649426117 +armv7neon_mmm_f32_8x6_generic 16 4 19 0.000005734445256719681 +armv7neon_mmm_f32_8x4_cortexa7 24 4 8 0.000003952435244053743 +armv7neon_mmm_f32_8x6_cortexa9 9 32 6 0.000003904765036328731 +armv7neon_mmm_f32_8x6_cortexa7 23 4 13 0.000007856276222270553 +armv7neon_mmm_f32_8x6_cortexa9 15 4 6 0.000002182843988681052 +armv7neon_mmm_f32_8x6_generic 23 4 18 0.000007195841362691159 +armv7neon_mmm_f32_8x4_cortexa7 16 32 12 0.000010255406371017558 +armv7neon_mmm_f32_8x4_cortexa9 25 128 8 0.0000309762014941751 +armv7neon_mmm_f32_8x6_cortexa7 25 128 17 0.00008204444005539351 +armv7neon_mmm_f32_8x6_cortexa7 17 4 6 0.0000026268622339640763 +armv7neon_mmm_f32_8x4_cortexa9 15 32 9 0.000008859604622252251 +generic_f32_4x4 5 32 9 0.000008264557275071169 +generic_f32_4x4 13 32 5 0.000010823964577863127 +generic_f32_4x4 5 4 3 0.000001851289980714991 +armv7neon_mmm_f32_8x4_generic 15 128 4 0.000008225264190077282 +armv7neon_mmm_f32_8x6_generic 17 32 18 0.000015295586544658026 +armv7neon_mmm_f32_8x6_cortexa9 9 4 18 0.000004572833723975051 +armv7neon_mmm_f32_8x4_cortexa7 15 4 13 0.000006075307141015665 +armv7neon_mmm_f32_8x6_cortexa7 17 32 19 0.000025630924279728742 +generic_f32_4x4 7 32 7 0.0000058385014382941885 +armv7neon_mmm_f32_8x4_cortexa9 16 128 7 0.000016054999854639244 +armv7neon_mmm_f32_8x4_cortexa7 25 4 4 0.0000029326008363454604 +armv7neon_mmm_f32_8x6_cortexa7 16 128 12 0.000027204121533574832 +armv7neon_mmm_f32_8x4_generic 15 128 3 0.000008426038232256742 +armv7neon_mmm_f32_8x4_cortexa9 7 32 5 0.0000033516027030490246 +armv7neon_mmm_f32_8x4_cortexa7 16 4 7 
0.0000032624014554442194 +armv7neon_mmm_f32_8x4_cortexa9 16 128 3 0.000008462143402947945 +armv7neon_mmm_f32_8x4_generic 25 4 7 0.000005557466852852917 +armv7neon_mmm_f32_8x4_generic 23 32 4 0.000004495607981128392 +armv7neon_mmm_f32_8x4_cortexa9 15 4 12 0.000004291414852711907 +generic_f32_4x4 4 4 13 0.0000027727007811134473 +armv7neon_mmm_f32_8x4_generic 16 4 8 0.0000025796442872451462 +armv7neon_mmm_f32_8x4_cortexa9 23 4 9 0.000006179316744537218 +armv7neon_mmm_f32_8x4_cortexa7 25 32 3 0.000007764249905705397 +armv7neon_mmm_f32_8x6_generic 17 32 7 0.000010687826852062675 +generic_f32_4x4 13 4 7 0.000005399817435719611 +generic_f32_4x4 12 4 4 0.0000021094685699886824 +armv7neon_mmm_f32_8x6_cortexa9 7 4 17 0.0000034025163658270913 +generic_f32_4x4 4 4 9 0.0000022359497586694876 +armv7neon_mmm_f32_8x6_cortexa9 17 128 17 0.00004660020476545699 +armv7neon_mmm_f32_8x4_generic 15 4 12 0.00000428385702551851 +armv7neon_mmm_f32_8x4_generic 9 4 11 0.000004185007110545824 +armv7neon_mmm_f32_8x6_cortexa9 24 128 18 0.000045657972165917084 +armv7neon_mmm_f32_8x6_cortexa9 15 4 19 0.000007027336593004323 +armv7neon_mmm_f32_8x4_cortexa9 9 128 9 0.00002367365046977455 +armv7neon_mmm_f32_8x4_cortexa9 16 4 9 0.000003960113146754688 +armv7neon_mmm_f32_8x4_cortexa7 8 32 9 0.000005561279017749759 +armv7neon_mmm_f32_8x4_cortexa9 23 32 3 0.000004897369194486712 +armv7neon_mmm_f32_8x4_cortexa9 16 32 3 0.0000034377989421061634 +armv7neon_mmm_f32_8x6_generic 25 32 5 0.00000775592482259445 +armv7neon_mmm_f32_8x4_cortexa9 9 128 11 0.000023733495515623523 +armv7neon_mmm_f32_8x4_generic 23 32 7 0.000008935361824557136 +armv7neon_mmm_f32_8x4_cortexa7 7 128 9 0.00001664390737173506 +generic_f32_4x4 5 4 8 0.0000028477215638418112 +armv7neon_mmm_f32_8x6_generic 23 4 19 0.000009557906532901389 +armv7neon_mmm_f32_8x6_cortexa9 16 128 12 0.000020636825667982075 +armv7neon_mmm_f32_8x6_generic 9 32 11 0.000007421682594966975 +armv7neon_mmm_f32_8x4_generic 23 128 13 0.000047353976390946026 
+armv7neon_mmm_f32_8x4_cortexa7 17 4 7 0.000004622886247456753 +armv7neon_mmm_f32_8x6_cortexa9 8 128 5 0.000005754129281131009 +armv7neon_mmm_f32_8x4_cortexa7 9 128 11 0.000032319021391833177 +armv7neon_mmm_f32_8x4_cortexa9 24 4 9 0.000005698490674755626 +armv7neon_mmm_f32_8x6_generic 17 4 18 0.000006326121915439696 +armv7neon_mmm_f32_8x4_cortexa9 16 128 12 0.000023196293109823396 +armv7neon_mmm_f32_8x4_cortexa9 25 32 9 0.000016534317998128363 +armv7neon_mmm_f32_8x6_cortexa7 24 4 5 0.0000032362274636792855 +armv7neon_mmm_f32_8x6_cortexa7 9 32 13 0.000013201464669659568 +armv7neon_mmm_f32_8x4_generic 25 128 5 0.000031432252054608495 +armv7neon_mmm_f32_8x4_cortexa7 16 32 8 0.000007004694197132091 +armv7neon_mmm_f32_8x4_cortexa7 25 128 8 0.00004234315278499361 +armv7neon_mmm_f32_8x6_cortexa9 25 128 11 0.00004162498921103509 +armv7neon_mmm_f32_8x4_cortexa9 17 4 11 0.000005990145971795821 +armv7neon_mmm_f32_8x6_generic 7 128 5 0.000005711586442858516 +generic_f32_4x4 5 32 7 0.000005745084475312582 +armv7neon_mmm_f32_8x4_cortexa7 17 4 3 0.0000027778998365925647 +armv7neon_mmm_f32_8x6_generic 7 32 17 0.000006323975084622719 +armv7neon_mmm_f32_8x4_cortexa7 24 32 4 0.0000053627426905310905 +armv7neon_mmm_f32_8x6_cortexa9 7 128 13 0.000016465054999968026 +armv7neon_mmm_f32_8x6_cortexa9 25 32 17 0.000021029276181809713 +armv7neon_mmm_f32_8x6_generic 7 4 18 0.0000035040816774469106 +armv7neon_mmm_f32_8x6_cortexa7 17 128 12 0.00004084848200658872 +armv7neon_mmm_f32_8x6_cortexa7 16 128 11 0.000027707012596319906 +armv7neon_mmm_f32_8x6_cortexa7 8 128 12 0.0000138330433065325 +armv7neon_mmm_f32_8x4_cortexa9 7 32 9 0.000004837549683885907 +armv7neon_mmm_f32_8x4_generic 7 32 7 0.000003409099607922408 +armv7neon_mmm_f32_8x6_cortexa7 16 4 5 0.0000023317089411955496 +armv7neon_mmm_f32_8x6_cortexa7 7 32 17 0.000007581121899800583 +armv7neon_mmm_f32_8x4_cortexa7 16 4 9 0.0000042985642379541856 +armv7neon_mmm_f32_8x6_generic 16 4 7 0.0000032960593218992506 +armv7neon_mmm_f32_8x4_generic 
15 32 5 0.0000060704297110345465 +armv7neon_mmm_f32_8x6_cortexa9 24 32 6 0.00000538028834244819 +armv7neon_mmm_f32_8x6_generic 25 32 18 0.0000200993477092098 +armv7neon_mmm_f32_8x6_generic 23 128 6 0.00001591506566691279 +armv7neon_mmm_f32_8x6_generic 16 32 13 0.00001049572423866497 +armv7neon_mmm_f32_8x4_cortexa7 15 32 5 0.000007565304970371118 +generic_f32_4x4 12 32 7 0.000008258205556820225 +armv7neon_mmm_f32_8x4_cortexa7 17 4 9 0.000006349279290785205 +generic_f32_4x4 8 128 5 0.000014938883570524155 +armv7neon_mmm_f32_8x4_generic 15 32 3 0.0000034015953691065083 +armv7neon_mmm_f32_8x6_cortexa7 7 4 18 0.000003692100644999123 +generic_f32_4x4 8 128 7 0.000015006901203342376 +generic_f32_4x4 4 4 3 0.0000012181206340328286 +generic_f32_4x4 12 32 5 0.000008206014399831754 +armv7neon_mmm_f32_8x6_cortexa9 8 128 19 0.000020717968248585576 +generic_f32_4x4 5 32 3 0.000003225094305304002 +armv7neon_mmm_f32_8x6_generic 24 4 18 0.000006021495896771489 +armv7neon_mmm_f32_8x6_cortexa7 16 128 13 0.00004093551587387655 +armv7neon_mmm_f32_8x4_cortexa9 15 128 8 0.00001604448990415154 +armv7neon_mmm_f32_8x6_generic 9 32 13 0.000010651318368732651 +armv7neon_mmm_f32_8x4_cortexa7 8 32 8 0.000003772107419321899 +generic_f32_4x4 9 32 3 0.000004621846936980697 +armv7neon_mmm_f32_8x4_cortexa9 16 32 5 0.000005906329928132943 +armv7neon_mmm_f32_8x6_cortexa9 16 4 6 0.000001808587040184933 +armv7neon_mmm_f32_8x6_cortexa9 9 32 13 0.000010778660548919894 +armv7neon_mmm_f32_8x4_cortexa9 16 32 4 0.0000030105150384631114 +armv7neon_mmm_f32_8x6_generic 17 4 17 0.000006838981837746129 +armv7neon_mmm_f32_8x6_cortexa7 7 128 19 0.000028499453971524397 +armv7neon_mmm_f32_8x6_cortexa7 23 32 17 0.000020319754403004648 +armv7neon_mmm_f32_8x6_generic 9 32 17 0.000010765407372129566 +armv7neon_mmm_f32_8x4_generic 23 4 13 0.000008003285656461567 +armv7neon_mmm_f32_8x4_generic 17 4 4 0.000002161502194915875 +armv7neon_mmm_f32_8x4_generic 23 4 11 0.000006323737070277622 +armv7neon_mmm_f32_8x4_generic 9 4 12 
0.00000394067683542442 +armv7neon_mmm_f32_8x4_cortexa7 23 128 4 0.00001636523555605817 +armv7neon_mmm_f32_8x4_cortexa9 17 32 12 0.000012187305627907507 +armv7neon_mmm_f32_8x4_generic 25 32 5 0.000011305023296577953 +armv7neon_mmm_f32_8x4_cortexa7 23 128 13 0.0000641420493251813 +armv7neon_mmm_f32_8x6_cortexa9 17 4 13 0.0000067882601578195375 +armv7neon_mmm_f32_8x6_cortexa9 24 32 13 0.00001554482246687919 +armv7neon_mmm_f32_8x4_cortexa7 15 128 12 0.00003239866815806022 +armv7neon_mmm_f32_8x6_cortexa9 23 4 13 0.000007410621365009766 +armv7neon_mmm_f32_8x4_cortexa9 17 128 5 0.000023718649742852314 +armv7neon_mmm_f32_8x6_cortexa9 15 32 13 0.000011397983577281118 +armv7neon_mmm_f32_8x6_cortexa7 15 32 5 0.0000050262586648579474 +armv7neon_mmm_f32_8x4_cortexa9 16 128 9 0.000023527233248412315 +armv7neon_mmm_f32_8x6_cortexa7 17 4 17 0.000007344312997274551 +armv7neon_mmm_f32_8x6_cortexa7 15 128 13 0.00004174083709909147 +armv7neon_mmm_f32_8x4_cortexa9 24 128 11 0.00003518379984445077 +armv7neon_mmm_f32_8x6_cortexa7 15 4 6 0.0000022388984932072717 +armv7neon_mmm_f32_8x6_generic 17 32 11 0.000010862418538636322 +armv7neon_mmm_f32_8x6_cortexa9 17 128 19 0.00006205059984963943 +armv7neon_mmm_f32_8x6_cortexa7 24 128 7 0.00004110240285622754 +armv7neon_mmm_f32_8x6_generic 25 32 11 0.00001430805053910281 +armv7neon_mmm_f32_8x4_cortexa9 23 128 11 0.00003562503787212338 +armv7neon_mmm_f32_8x4_generic 8 128 13 0.00001577265144197705 +armv7neon_mmm_f32_8x4_cortexa9 15 32 3 0.0000034017274319352206 +generic_f32_4x4 13 32 3 0.000005959878460430609 +armv7neon_mmm_f32_8x4_cortexa9 8 32 9 0.000004432308412032044 +armv7neon_mmm_f32_8x4_cortexa7 8 4 5 0.0000018214760004994945 +armv7neon_mmm_f32_8x4_generic 16 4 5 0.0000029127359283609326 +armv7neon_mmm_f32_8x4_cortexa7 8 32 11 0.000005611833179795977 +armv7neon_mmm_f32_8x4_generic 17 128 12 0.00003481391410825688 +armv7neon_mmm_f32_8x6_cortexa7 17 4 19 0.00000928821589263059 +armv7neon_mmm_f32_8x6_generic 8 4 19 0.00000313513885546113 
+generic_f32_4x4 11 32 11 0.000012240954381721343 +armv7neon_mmm_f32_8x4_cortexa9 9 128 8 0.000015834005248731945 +armv7neon_mmm_f32_8x6_generic 25 128 7 0.00004137887977916487 +armv7neon_mmm_f32_8x6_cortexa7 25 32 13 0.000025709475387022286 +armv7neon_mmm_f32_8x6_cortexa7 24 32 5 0.000007312970559712554 +armv7neon_mmm_f32_8x4_cortexa9 7 32 3 0.0000019121426586981612 +armv7neon_mmm_f32_8x4_cortexa9 8 4 12 0.0000020507155861457753 +armv7neon_mmm_f32_8x4_cortexa9 25 128 4 0.000015694825557140796 +armv7neon_mmm_f32_8x6_cortexa9 9 4 17 0.0000048654285121987666 +armv7neon_mmm_f32_8x4_generic 15 32 13 0.000011594181812430167 +generic_f32_4x4 8 4 5 0.0000029234671714958714 +armv7neon_mmm_f32_8x6_generic 25 128 18 0.00006398210071667638 +armv7neon_mmm_f32_8x4_generic 25 32 7 0.000011481869355677313 +armv7neon_mmm_f32_8x6_cortexa7 15 32 13 0.000013835762311238379 +armv7neon_mmm_f32_8x6_generic 7 32 7 0.000004270239691930375 +armv7neon_mmm_f32_8x4_cortexa9 24 4 12 0.0000052513903304867065 +armv7neon_mmm_f32_8x4_cortexa7 7 128 13 0.000022090489564676277 +armv7neon_mmm_f32_8x4_cortexa7 9 32 7 0.0000074510499136902 +armv7neon_mmm_f32_8x4_generic 15 128 11 0.00002405185440078953 +armv7neon_mmm_f32_8x6_cortexa7 24 128 18 0.00006059904552298362 +armv7neon_mmm_f32_8x4_cortexa9 24 4 8 0.0000036249029546823233 +armv7neon_mmm_f32_8x4_generic 17 4 5 0.000004186211638893127 +armv7neon_mmm_f32_8x4_cortexa7 15 128 5 0.00002190540588603946 +armv7neon_mmm_f32_8x6_cortexa7 9 32 5 0.000004913938240718097 +armv7neon_mmm_f32_8x4_cortexa7 25 128 4 0.000021428570118948462 +armv7neon_mmm_f32_8x6_generic 9 4 17 0.000004759335274456197 +armv7neon_mmm_f32_8x4_cortexa9 23 32 5 0.000008792076478618836 +armv7neon_mmm_f32_8x4_cortexa9 17 128 13 0.00004665611088212803 +generic_f32_4x4 11 128 12 0.00003275274568553372 +armv7neon_mmm_f32_8x6_cortexa7 7 128 12 0.000014620372931092478 +armv7neon_mmm_f32_8x6_generic 24 128 13 0.000046299674326571746 +generic_f32_4x4 12 4 9 0.000005632703760211813 
+armv7neon_mmm_f32_8x6_cortexa7 15 32 6 0.000004965054605236343 +armv7neon_mmm_f32_8x6_cortexa9 7 128 17 0.000016538122105202102 +armv7neon_mmm_f32_8x6_cortexa9 17 128 7 0.00003112401925909139 +armv7neon_mmm_f32_8x4_cortexa7 7 32 5 0.0000040832664009078 +armv7neon_mmm_f32_8x4_cortexa9 25 4 3 0.000003352094433744027 +armv7neon_mmm_f32_8x4_cortexa7 17 4 4 0.0000023752236037367475 +armv7neon_mmm_f32_8x4_cortexa9 8 4 4 0.000000986252516977325 +armv7neon_mmm_f32_8x6_cortexa7 9 4 5 0.0000022147026524486433 +armv7neon_mmm_f32_8x6_cortexa9 24 4 13 0.000006569367159583456 +armv7neon_mmm_f32_8x6_cortexa9 25 32 13 0.000020818742897252734 +armv7neon_mmm_f32_8x4_cortexa9 15 32 12 0.000008751598473767715 +armv7neon_mmm_f32_8x4_cortexa7 7 4 4 0.000001249360695162341 +armv7neon_mmm_f32_8x6_generic 25 4 19 0.000011234098334898354 +generic_f32_4x4 7 4 9 0.000004262590318570047 +generic_f32_4x4 12 128 9 0.000032724926249473206 +armv7neon_mmm_f32_8x4_cortexa7 23 4 11 0.0000067800221845095914 +armv7neon_mmm_f32_8x4_cortexa7 23 4 13 0.00000857826635505948 +armv7neon_mmm_f32_8x4_generic 7 32 3 0.0000019119582227554076 +armv7neon_mmm_f32_8x6_cortexa9 25 4 12 0.000005714183543712814 +generic_f32_4x4 7 128 8 0.000014963178444176004 +armv7neon_mmm_f32_8x6_generic 23 4 11 0.000005289174016444985 +armv7neon_mmm_f32_8x4_generic 15 4 7 0.000003212917021733202 +armv7neon_mmm_f32_8x4_cortexa9 17 4 4 0.000002157304549786207 +armv7neon_mmm_f32_8x4_cortexa7 25 4 8 0.000005296217018377082 +armv7neon_mmm_f32_8x4_cortexa7 16 4 8 0.000002789622045679376 +armv7neon_mmm_f32_8x4_cortexa9 9 32 11 0.00000862398264754348 +armv7neon_mmm_f32_8x6_generic 16 4 18 0.000004182651246093671 +armv7neon_mmm_f32_8x4_cortexa9 7 4 7 0.0000019258225946178918 +armv7neon_mmm_f32_8x4_generic 7 32 13 0.0000063109264345399555 +armv7neon_mmm_f32_8x6_generic 15 32 7 0.000007649176449323901 +armv7neon_mmm_f32_8x4_cortexa7 9 4 13 0.00000566559429146675 +generic_f32_4x4 11 4 13 0.00000768804413911549 +armv7neon_mmm_f32_8x6_cortexa7 
15 128 19 0.000055484906874277455 +armv7neon_mmm_f32_8x6_generic 8 128 19 0.000020785294814215668 +armv7neon_mmm_f32_8x4_cortexa7 23 32 7 0.000011137926376204002 +armv7neon_mmm_f32_8x4_cortexa9 25 32 4 0.000005634291322373054 +armv7neon_mmm_f32_8x4_cortexa7 16 4 11 0.0000043943657333441155 +generic_f32_4x4 3 128 4 0.000004206283247834624 +armv7neon_mmm_f32_8x6_cortexa9 23 32 17 0.000016635289447796483 +armv7neon_mmm_f32_8x6_cortexa7 23 4 5 0.0000032303127776050868 +generic_f32_4x4 7 128 7 0.000015111291354960824 +armv7neon_mmm_f32_8x6_cortexa7 8 128 19 0.000027387915918929853 +armv7neon_mmm_f32_8x4_cortexa7 24 32 9 0.000015681272955767534 +armv7neon_mmm_f32_8x6_cortexa7 15 32 7 0.000009363971820493224 +armv7neon_mmm_f32_8x6_generic 15 4 17 0.000005464908042174986 +generic_f32_4x4 11 4 4 0.000002266320385947867 +armv7neon_mmm_f32_8x4_cortexa7 7 4 5 0.0000019827556786453945 +armv7neon_mmm_f32_8x4_generic 7 4 5 0.0000018753720795687383 +armv7neon_mmm_f32_8x6_cortexa7 23 32 19 0.000026555363914009545 +generic_f32_4x4 12 32 11 0.000011904804943326893 +generic_f32_4x4 9 32 4 0.000004278605538448279 +armv7neon_mmm_f32_8x4_cortexa9 9 4 3 0.0000018374246302226002 +armv7neon_mmm_f32_8x4_generic 17 128 11 0.00003530676669688071 +armv7neon_mmm_f32_8x4_cortexa7 23 32 5 0.000011014184598781204 +armv7neon_mmm_f32_8x4_cortexa7 8 128 8 0.000010889817517963246 +armv7neon_mmm_f32_8x4_generic 24 128 5 0.000023658482563487358 +armv7neon_mmm_f32_8x4_generic 23 4 4 0.0000022865778486536074 +armv7neon_mmm_f32_8x6_cortexa7 16 32 7 0.000008963466531834949 +armv7neon_mmm_f32_8x4_cortexa7 16 128 11 0.00003220406926334493 +armv7neon_mmm_f32_8x4_cortexa9 7 4 9 0.000002612401333065891 +armv7neon_mmm_f32_8x6_generic 24 128 18 0.00004570910076801179 +armv7neon_mmm_f32_8x6_cortexa7 15 32 19 0.00001825949129411215 +armv7neon_mmm_f32_8x4_cortexa7 7 128 11 0.00001669033881154567 +armv7neon_mmm_f32_8x6_cortexa7 17 128 17 0.00006152323359147728 +armv7neon_mmm_f32_8x6_generic 23 4 12 
0.0000049613667404571205 +armv7neon_mmm_f32_8x6_cortexa9 17 4 6 0.0000025304413562385254 +armv7neon_mmm_f32_8x6_generic 25 32 13 0.000020634077057855525 +generic_f32_4x4 3 128 8 0.000007895048382688004 +armv7neon_mmm_f32_8x6_generic 7 128 6 0.000005880420229157795 +armv7neon_mmm_f32_8x6_cortexa7 7 4 6 0.0000015435048498918165 +armv7neon_mmm_f32_8x6_cortexa9 15 4 12 0.0000038035548284711065 +armv7neon_mmm_f32_8x4_generic 16 128 13 0.00003105986968794038 +armv7neon_mmm_f32_8x6_generic 17 128 6 0.00001561934075445538 +armv7neon_mmm_f32_8x6_generic 23 32 5 0.000006038048456367719 +generic_f32_4x4 3 4 8 0.0000018887559388867697 +armv7neon_mmm_f32_8x6_cortexa7 17 128 5 0.000021122166031945488 +armv7neon_mmm_f32_8x4_cortexa7 17 4 12 0.000005984255865199454 +armv7neon_mmm_f32_8x6_cortexa9 8 4 17 0.00000262350755833462 +armv7neon_mmm_f32_8x6_generic 24 32 17 0.000015705211312591065 +armv7neon_mmm_f32_8x6_generic 23 32 7 0.00001103425525544337 +armv7neon_mmm_f32_8x6_generic 23 128 18 0.00004698363257869806 +armv7neon_mmm_f32_8x4_cortexa7 9 128 4 0.000010997242917041898 +generic_f32_4x4 12 32 9 0.000011855594903877275 +armv7neon_mmm_f32_8x4_cortexa7 8 32 4 0.000002107136592547487 +armv7neon_mmm_f32_8x4_cortexa7 9 32 4 0.0000038494164534325605 +generic_f32_4x4 8 32 4 0.0000029596097342538777 +armv7neon_mmm_f32_8x6_cortexa9 7 4 13 0.0000033183395006378194 +armv7neon_mmm_f32_8x4_cortexa9 7 128 5 0.00000837292374457436 +armv7neon_mmm_f32_8x6_cortexa7 9 32 12 0.000008879244649291164 +armv7neon_mmm_f32_8x6_cortexa9 23 32 5 0.00000608726564780247 +generic_f32_4x4 4 4 11 0.000002253269465041155 +armv7neon_mmm_f32_8x6_generic 15 4 12 0.000003737614965115773 +armv7neon_mmm_f32_8x4_cortexa7 8 32 13 0.000007179427212160366 +armv7neon_mmm_f32_8x6_generic 7 4 7 0.00000227244051714339 +armv7neon_mmm_f32_8x6_generic 7 128 13 0.000016430484681121403 +armv7neon_mmm_f32_8x4_cortexa9 25 4 13 0.000009867842979190694 +armv7neon_mmm_f32_8x4_cortexa9 24 32 7 0.000008714828341166468 
+armv7neon_mmm_f32_8x6_generic 7 128 12 0.00001125899429533851 +armv7neon_mmm_f32_8x4_cortexa7 24 128 3 0.000016779227724494087 +armv7neon_mmm_f32_8x6_cortexa7 25 32 11 0.000017700250695390733 +armv7neon_mmm_f32_8x6_cortexa9 8 128 17 0.00001578057628845984 +armv7neon_mmm_f32_8x6_cortexa9 15 128 18 0.00003176376886582956 +armv7neon_mmm_f32_8x6_generic 7 32 18 0.000006481310483268657 +armv7neon_mmm_f32_8x6_generic 24 32 19 0.000020260301856480532 +armv7neon_mmm_f32_8x6_cortexa7 25 4 7 0.000006556992855989281 +armv7neon_mmm_f32_8x4_cortexa7 9 4 8 0.0000030004666340701197 +armv7neon_mmm_f32_8x6_generic 8 4 13 0.0000025029052985754286 +armv7neon_mmm_f32_8x4_cortexa7 16 32 13 0.00001386814401860724 +armv7neon_mmm_f32_8x4_generic 16 128 3 0.000008461514063719399 +generic_f32_4x4 8 32 5 0.000005666217550078829 +armv7neon_mmm_f32_8x4_generic 17 32 9 0.000012567178399696468 +armv7neon_mmm_f32_8x4_cortexa7 15 128 9 0.00003252514906200416 +armv7neon_mmm_f32_8x6_generic 25 128 13 0.00006413295578186804 +armv7neon_mmm_f32_8x6_generic 7 4 11 0.0000023405037923286433 +generic_f32_4x4 7 32 12 0.000008239110132919408 +armv7neon_mmm_f32_8x6_cortexa9 23 128 6 0.00001597150399238562 +armv7neon_mmm_f32_8x4_generic 25 4 13 0.000009863674729272788 +armv7neon_mmm_f32_8x4_cortexa7 16 128 5 0.00002172651909764639 +armv7neon_mmm_f32_8x6_cortexa7 15 32 12 0.000009468415752921276 +armv7neon_mmm_f32_8x6_cortexa9 24 4 11 0.000004968600626897013 +armv7neon_mmm_f32_8x4_generic 24 128 13 0.00004635289274725685 +armv7neon_mmm_f32_8x6_generic 23 128 5 0.000016196412852382603 +armv7neon_mmm_f32_8x6_cortexa7 16 32 13 0.000013024830178376121 +armv7neon_mmm_f32_8x4_cortexa9 9 32 3 0.0000033151175658832473 +armv7neon_mmm_f32_8x6_generic 7 4 13 0.000003265379138781528 +armv7neon_mmm_f32_8x4_cortexa9 17 32 13 0.000016459192383496957 +armv7neon_mmm_f32_8x6_cortexa7 24 32 13 0.000019272536127977423 +generic_f32_4x4 4 32 3 0.0000019007760051192194 +generic_f32_4x4 9 128 8 0.000021954328754919918 
+armv7neon_mmm_f32_8x4_cortexa7 8 32 12 0.000005371382475600821 +generic_f32_4x4 7 4 3 0.000001903414095718297 +armv7neon_mmm_f32_8x6_generic 24 4 7 0.000004666010098240403 +armv7neon_mmm_f32_8x4_cortexa9 16 32 8 0.000005563656358158919 +generic_f32_4x4 3 32 9 0.000004602770059818119 +armv7neon_mmm_f32_8x4_generic 9 32 13 0.000011194068458304182 +armv7neon_mmm_f32_8x4_cortexa9 15 32 11 0.000008943973907120027 +generic_f32_4x4 11 128 7 0.000022328923764966492 +armv7neon_mmm_f32_8x6_generic 8 128 5 0.000005731544283554465 +armv7neon_mmm_f32_8x6_cortexa9 8 128 11 0.00001076946555010939 +armv7neon_mmm_f32_8x4_cortexa7 7 128 5 0.000011229094508072489 +armv7neon_mmm_f32_8x4_cortexa9 15 128 9 0.000023961524252030835 +generic_f32_4x4 11 128 5 0.000022274598932900346 +generic_f32_4x4 5 128 3 0.000007857636386132928 +generic_f32_4x4 4 128 7 0.00000774651849385228 +armv7neon_mmm_f32_8x4_cortexa9 15 128 5 0.000016175167728379695 +armv7neon_mmm_f32_8x6_cortexa7 17 4 5 0.0000031159889361188713 +generic_f32_4x4 12 4 11 0.000005685376438998431 +armv7neon_mmm_f32_8x4_cortexa7 8 32 5 0.000003934735618425319 +generic_f32_4x4 4 32 4 0.0000017328412461691005 +armv7neon_mmm_f32_8x4_cortexa7 8 128 11 0.000016332637899779713 +armv7neon_mmm_f32_8x6_cortexa9 15 32 17 0.000011527836971780865 +armv7neon_mmm_f32_8x4_generic 15 4 9 0.000004398255246018968 +armv7neon_mmm_f32_8x4_cortexa9 15 32 8 0.000005965121319547414 +armv7neon_mmm_f32_8x6_cortexa7 8 4 11 0.0000020902150325449643 +armv7neon_mmm_f32_8x6_cortexa7 15 4 13 0.000005648201530441972 +armv7neon_mmm_f32_8x6_cortexa9 23 128 18 0.00004690308298866514 +armv7neon_mmm_f32_8x6_cortexa7 17 128 18 0.00006100420785992372 +generic_f32_4x4 3 32 3 0.000001876030465879534 +generic_f32_4x4 4 128 8 0.000007586370052411088 +armv7neon_mmm_f32_8x6_cortexa9 8 128 18 0.000015532022218663232 +armv7neon_mmm_f32_8x4_cortexa7 17 4 11 0.000006454744732596089 +armv7neon_mmm_f32_8x6_cortexa9 15 4 11 0.0000038999665208551116 +armv7neon_mmm_f32_8x6_cortexa9 25 128 
17 0.00006218409021295646 +armv7neon_mmm_f32_8x6_cortexa7 9 128 7 0.000027592791728379908 +armv7neon_mmm_f32_8x4_cortexa7 17 128 7 0.0000324174579838398 +armv7neon_mmm_f32_8x6_generic 17 32 13 0.00001566115346007812 +generic_f32_4x4 9 4 4 0.0000022019063671444208 +armv7neon_mmm_f32_8x4_cortexa9 17 128 11 0.0000353036171900936 +armv7neon_mmm_f32_8x4_cortexa7 17 32 12 0.0000154170992202297 +generic_f32_4x4 4 4 12 0.0000020915219079462592 +armv7neon_mmm_f32_8x4_generic 16 4 3 0.000001964189345127758 +generic_f32_4x4 11 4 3 0.0000025875707471480883 +armv7neon_mmm_f32_8x4_cortexa7 24 32 8 0.000010240674992864977 +armv7neon_mmm_f32_8x4_cortexa7 24 4 7 0.00000464383472294094 +armv7neon_mmm_f32_8x6_generic 24 128 17 0.00004653762565754179 +armv7neon_mmm_f32_8x4_cortexa9 9 128 5 0.000015984472800286198 +armv7neon_mmm_f32_8x6_cortexa9 17 32 13 0.000015793322899399953 +armv7neon_mmm_f32_8x4_cortexa7 23 128 9 0.00004832080937818461 +armv7neon_mmm_f32_8x6_generic 17 4 12 0.0000043886752215018795 +armv7neon_mmm_f32_8x6_generic 24 4 17 0.000006718970310010203 +armv7neon_mmm_f32_8x6_cortexa9 23 4 12 0.000005053037256901505 +armv7neon_mmm_f32_8x4_generic 25 128 13 0.00006314133455535656 +generic_f32_4x4 9 128 7 0.000022234980219629062 +armv7neon_mmm_f32_8x6_cortexa7 7 128 7 0.000014376478927807395 +armv7neon_mmm_f32_8x6_cortexa7 16 128 5 0.0000143212403106337 +armv7neon_mmm_f32_8x6_generic 25 32 6 0.000007045905360966285 +armv7neon_mmm_f32_8x6_generic 25 32 17 0.000020892213993308648 +armv7neon_mmm_f32_8x6_cortexa9 24 32 7 0.000010711106258433335 +armv7neon_mmm_f32_8x4_cortexa7 15 32 7 0.000007644594488957809 +armv7neon_mmm_f32_8x4_generic 9 32 7 0.0000059612924557323865 +armv7neon_mmm_f32_8x4_cortexa7 24 32 5 0.000010813567327083286 +generic_f32_4x4 11 32 9 0.000012181640310257489 +armv7neon_mmm_f32_8x4_generic 8 32 11 0.0000044856528446547376 +armv7neon_mmm_f32_8x6_cortexa7 8 128 5 0.00000740698393210008 +armv7neon_mmm_f32_8x6_generic 9 4 13 0.00000467415217244372 
+armv7neon_mmm_f32_8x4_cortexa9 8 32 5 0.0000031557846756887957 +armv7neon_mmm_f32_8x6_generic 9 32 6 0.000003833293115622011 +generic_f32_4x4 5 128 11 0.000022197811800866368 +armv7neon_mmm_f32_8x4_cortexa7 8 4 8 0.000001630228451378558 +armv7neon_mmm_f32_8x6_cortexa7 7 128 18 0.00002168993830362326 +armv7neon_mmm_f32_8x4_generic 16 4 7 0.000003012988093329698 +armv7neon_mmm_f32_8x6_generic 9 32 5 0.0000040586320817732015 +armv7neon_mmm_f32_8x4_generic 8 128 11 0.000012020324272970633 +armv7neon_mmm_f32_8x4_cortexa9 9 4 13 0.00000525401614258308 +armv7neon_mmm_f32_8x6_cortexa9 17 32 5 0.000005959198344084862 +armv7neon_mmm_f32_8x6_cortexa9 25 128 7 0.00004136188393626292 +armv7neon_mmm_f32_8x6_cortexa7 8 4 6 0.0000011677036715350285 +armv7neon_mmm_f32_8x6_cortexa7 7 32 11 0.000005167860829086469 +armv7neon_mmm_f32_8x4_generic 24 4 11 0.000005847812768700571 +armv7neon_mmm_f32_8x4_generic 17 128 8 0.0000233987853798507 +armv7neon_mmm_f32_8x4_cortexa9 7 128 8 0.000008435297227106425 +armv7neon_mmm_f32_8x4_generic 8 4 4 0.000000981928574441578 +generic_f32_4x4 13 32 8 0.000010469226231367714 +generic_f32_4x4 12 4 7 0.000004168142600643075 +armv7neon_mmm_f32_8x6_generic 25 128 5 0.000021400005561991285 +armv7neon_mmm_f32_8x6_cortexa9 23 128 19 0.00006309986803011645 +armv7neon_mmm_f32_8x6_cortexa7 17 32 5 0.000007188792393610533 +armv7neon_mmm_f32_8x6_cortexa9 7 32 17 0.00000637568429598606 +armv7neon_mmm_f32_8x6_cortexa9 25 32 19 0.000027309010551769435 +armv7neon_mmm_f32_8x4_cortexa7 7 32 8 0.000004145572586004413 +armv7neon_mmm_f32_8x4_cortexa9 24 32 9 0.000012468417684719209 +armv7neon_mmm_f32_8x6_cortexa7 23 128 7 0.00004157442562492112 diff --git a/vendor/tract-linalg-0.22.1/src/arm64.rs b/vendor/tract-linalg-0.22.1/src/arm64.rs new file mode 100644 index 000000000..f44c38a63 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64.rs @@ -0,0 +1,383 @@ +#![allow(clippy::excessive_precision)] +#[cfg(any(target_os = "macos", all(target_os = "ios", feature = 
"apple-amx-ios")))] +mod apple_amx; +mod arm64simd; +pub mod cortex_a53; +mod cortex_a55; +//mod cortex_a72; +//mod cortex_a73; +pub use arm64simd::*; + +#[cfg(not(feature = "no_fp16"))] +pub mod arm64fp16; +#[cfg(not(feature = "no_fp16"))] +pub use arm64fp16::*; + +use crate::f16; +use crate::{BinOp, DatumType, LinalgRegistry, Ops}; + +use crate::frame::by_scalar::ByScalarKer; +use crate::frame::element_wise::ElementWiseKer; +use crate::frame::reduce::{MapReduceKer, ReduceKer}; +use crate::frame::unicast::UnicastKer; + +// https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores +const PART_A53: &str = "0xd03"; +const PART_A55: &str = "0xd05"; +#[allow(dead_code)] +const PART_A72: &str = "0xd08"; +#[allow(dead_code)] +const PART_A73: &str = "0xd09"; +#[allow(dead_code)] +const PART_A75: &str = "0xd0a"; +#[allow(dead_code)] +const PART_NEOVERSE_N1: &str = "0xd0c"; +#[allow(dead_code)] +const PART_NEOVERSE_N2: &str = "0xd49"; +#[allow(dead_code)] +const PART_NEOVERSE_N3: &str = "0xd8e"; +#[allow(dead_code)] +const PART_NEOVERSE_V1: &str = "0xd40"; +#[allow(dead_code)] +const PART_NEOVERSE_V2: &str = "0xd4f"; +#[allow(dead_code)] +const PART_NEOVERSE_V3: &str = "0xd83"; + +fn max_cpuid() -> std::io::Result { + let cpu_info = std::fs::read_to_string("/proc/cpuinfo")?; + let max = cpu_info + .lines() + .filter(|line| line.starts_with("CPU part")) + .map(|line| line.split_whitespace().last().unwrap_or("")) + .max(); + Ok(max.unwrap_or("").to_string()) +} + +lazy_static::lazy_static! { + static ref KIND: Kind = Kind::choose(); + + static ref CPU_FEATURES: Vec = { + #[cfg(test)] crate::setup_test_logger(); + let Ok(cpu_info) = std::fs::read_to_string("/proc/cpuinfo") else { + log::warn!("Could not read /proc/cpuinfo. 
CPU Features detection may be impaired."); + return vec!(); + }; + if let Some(line) = cpu_info + .lines() + .find(|line| line.starts_with("Features")) { + line.split_once(':').unwrap().1.split_whitespace().map(|s| s.to_string()).collect() + } else { + log::warn!("Could not find \"Features :\" lines in /proc/cpuinfo. CPU Features detection may be impaired."); + vec!() + } + }; + + static ref HAS_FP16: bool = { + CPU_FEATURES.iter().any(|s| &**s == "asimdhp") + }; +} + +#[cfg(any(target_os = "macos", target_os = "ios"))] +fn apple_get_syscall(key: &str) -> String { + use std::ffi::{c_char, c_void, CStr, CString}; + use std::ptr::null_mut; + + unsafe extern "C" { + fn sysctlbyname( + name: *const c_char, + oldp: *mut c_void, + oldlenp: *mut isize, + newp: *mut c_void, + newlen: isize, + ); + } + + unsafe { + let mut len: isize = 0; + let name = CString::new(key).unwrap(); + sysctlbyname(name.as_ptr(), null_mut(), &mut len, null_mut(), 0); + let mut buf = vec![0u8; len as _]; + sysctlbyname(name.as_ptr(), buf.as_mut_ptr() as _, &mut len, null_mut(), 0); + CStr::from_bytes_with_nul(&buf).unwrap().to_string_lossy().into_owned() + } +} + +#[cfg(target_os = "macos")] +pub fn has_amx() -> bool { + !apple_get_syscall("machdep.cpu.brand_string").contains("(Virtual)") +} + +#[cfg(target_os = "ios")] +lazy_static::lazy_static! 
{ + static ref IPHONE_MODEL_MAJOR:Option = { + let version = apple_get_syscall("hw.machine"); + let Some((major, _)) = version.trim_start_matches("iPhone").split_once(",") else { return None }; + major.parse::().ok() + }; +} + +#[cfg(all(target_os = "ios", feature = "apple-amx-ios"))] +fn has_amx() -> bool { + // iPhone12,1 is the one branded "iPhone 11", with Apple A13 bionic, first CPU featuring amx + IPHONE_MODEL_MAJOR.map(|it| it >= 12).unwrap_or(false) +} + +#[inline] +#[cfg(target_os = "ios")] +pub fn has_fp16() -> bool { + // iPhone10,1 is the one branded "iPhone 8", with Apple A11 bionic, first CPU featuring fp16 + IPHONE_MODEL_MAJOR.map(|it| it >= 10).unwrap_or(false) +} + +#[inline] +#[cfg(not(target_os = "ios"))] +pub fn has_fp16() -> bool { + cfg!(target_os = "macos") + || cfg!(feature_cpu = "fp16") + || *KIND == Kind::CortexA55 + || *KIND == Kind::CortexA75 + || *HAS_FP16 +} + +#[target_feature(enable = "fp16")] +#[inline] +pub unsafe fn add_f16(a: f16, b: f16) -> f16 { + unsafe { + let result: u16; + std::arch::asm!( + "fadd {0:h}, {1:h}, {2:h}", + lateout(vreg) result, + in(vreg) a.to_bits(), + in(vreg) b.to_bits(), + options(pure, nomem, nostack, preserves_flags)); + f16::from_bits(result) + } +} + +#[target_feature(enable = "fp16")] +#[inline] +pub unsafe fn mul_f16(a: f16, b: f16) -> f16 { + unsafe { + let result: u16; + std::arch::asm!( + "fmul {0:h}, {1:h}, {2:h}", + lateout(vreg) result, + in(vreg) a.to_bits(), + in(vreg) b.to_bits(), + options(pure, nomem, nostack, preserves_flags)); + f16::from_bits(result) + } +} + +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum Kind { + Generic, + AppleM, + Neoverse, + CortexA53, + CortexA55, + CortexA72, + CortexA73, + CortexA75, +} + +impl Kind { + pub fn choose() -> Kind { + #[cfg(test)] + crate::setup_test_logger(); + let kind = if let Ok(kind) = std::env::var("TRACT_CPU_AARCH64_KIND") { + log::info!("CPU kind forced with TRACT_CPU_AARCH64_KIND: {}", kind); + let kind = kind.to_lowercase(); + if 
kind.contains("a53") { + Kind::CortexA53 + } else if kind.contains("a55") { + Kind::CortexA55 + } else if kind.contains("a72") { + Kind::CortexA72 + } else if kind.contains("a73") { + Kind::CortexA73 + } else if kind.contains("a75") { + Kind::CortexA75 + } else if kind.contains("neoverse") { + Kind::Neoverse + } else if kind.contains("applem") { + Kind::AppleM + } else { + Kind::Generic + } + } else if cfg!(target_os = "macos") { + Kind::AppleM + } else { + let part = if let Ok(part) = std::env::var("TRACT_CPU_AARCH64_OVERRIDE_CPU_PART") { + log::info!("CPU part forced with TRACT_CPU_AARCH64_OVERRIDE_CPU_PART: {}", part); + part + } else if cfg!(target_os = "linux") { + let part = max_cpuid().unwrap_or_else(|_| "0x00".to_string()); + log::info!("CPU part auto detected: {}", part); + part + } else { + log::info!("Unknown CPU part"); + "0x00".to_string() + }; + match &*part { + PART_A53 => Kind::CortexA53, + PART_A55 => Kind::CortexA55, + PART_A72 => Kind::CortexA72, + PART_A73 => Kind::CortexA73, + PART_A75 => Kind::CortexA75, + PART_NEOVERSE_N1 | PART_NEOVERSE_N2 | PART_NEOVERSE_N3 | PART_NEOVERSE_V1 + | PART_NEOVERSE_V2 | PART_NEOVERSE_V3 => Kind::Neoverse, + _ => Kind::Generic, + } + }; + log::info!("CPU optimisation: {:?}", kind); + kind + } +} + +pub(crate) fn register_all_unicast(registry: &mut LinalgRegistry) { + registry + .insert((BinOp::Mul, DatumType::F32), Box::new(|| arm64simd_unicast_mul_f32_16n::bin())); + registry + .insert((BinOp::Mul, DatumType::F16), Box::new(|| arm64fp16_unicast_mul_f16_32n::bin())); + registry + .insert((BinOp::Add, DatumType::F32), Box::new(|| arm64simd_unicast_add_f32_16n::bin())); + registry + .insert((BinOp::Add, DatumType::F16), Box::new(|| arm64fp16_unicast_add_f16_32n::bin())); + registry + .insert((BinOp::Sub, DatumType::F32), Box::new(|| arm64simd_unicast_sub_f32_16n::bin())); + registry + .insert((BinOp::Sub, DatumType::F16), Box::new(|| arm64fp16_unicast_sub_f16_32n::bin())); + registry + .insert((BinOp::SubF, 
DatumType::F32), Box::new(|| arm64simd_unicast_subf_f32_16n::bin())); + registry + .insert((BinOp::SubF, DatumType::F16), Box::new(|| arm64fp16_unicast_subf_f16_32n::bin())); + registry + .insert((BinOp::Min, DatumType::F32), Box::new(|| arm64simd_unicast_min_f32_16n::bin())); + registry + .insert((BinOp::Min, DatumType::F16), Box::new(|| arm64fp16_unicast_min_f16_32n::bin())); + registry + .insert((BinOp::Max, DatumType::F32), Box::new(|| arm64simd_unicast_max_f32_16n::bin())); + registry + .insert((BinOp::Max, DatumType::F16), Box::new(|| arm64fp16_unicast_max_f16_32n::bin())); +} + +pub(crate) fn register_all_by_scalar(registry: &mut LinalgRegistry) { + registry + .insert((BinOp::Mul, DatumType::F32), Box::new(|| arm64simd_mul_by_scalar_f32_16n::bin())); + registry + .insert((BinOp::Mul, DatumType::F16), Box::new(|| arm64fp16_mul_by_scalar_f16_32n::bin())); + registry + .insert((BinOp::Add, DatumType::F32), Box::new(|| arm64simd_add_by_scalar_f32_16n::bin())); + registry + .insert((BinOp::Add, DatumType::F16), Box::new(|| arm64fp16_add_by_scalar_f16_32n::bin())); + registry + .insert((BinOp::Sub, DatumType::F32), Box::new(|| arm64simd_sub_by_scalar_f32_16n::bin())); + registry + .insert((BinOp::Sub, DatumType::F16), Box::new(|| arm64fp16_sub_by_scalar_f16_32n::bin())); + registry.insert( + (BinOp::SubF, DatumType::F32), + Box::new(|| arm64simd_subf_by_scalar_f32_16n::bin()), + ); + registry.insert( + (BinOp::SubF, DatumType::F16), + Box::new(|| arm64fp16_subf_by_scalar_f16_32n::bin()), + ); + registry + .insert((BinOp::Min, DatumType::F32), Box::new(|| arm64simd_min_by_scalar_f32_16n::bin())); + registry + .insert((BinOp::Min, DatumType::F16), Box::new(|| arm64fp16_min_by_scalar_f16_32n::bin())); + registry + .insert((BinOp::Max, DatumType::F32), Box::new(|| arm64simd_max_by_scalar_f32_16n::bin())); + registry + .insert((BinOp::Max, DatumType::F16), Box::new(|| arm64fp16_max_by_scalar_f16_32n::bin())); +} + +pub fn plug(ops: &mut Ops) { + arm64simd::plug(ops); + 
+ #[cfg(not(feature = "no_fp16"))] + if has_fp16() { + arm64fp16::plug(ops); + } + + ops.qmmm_i32 = Box::new(|_, _, _| arm64simd_mmm_i32_8x8.mmm()); + ops.qmmv_i32 = Box::new(|_, _| arm64simd_mmm_i32_64x1.mmm()); + ops.mmv_f32 = match *KIND { + Kind::CortexA53 => Box::new(|_, _| arm64simd_mmm_f32_64x1_a53.mmm()), + Kind::CortexA55 => Box::new(|_, _| arm64simd_mmm_f32_64x1_a55.mmm()), + _ => Box::new(|_, _| arm64simd_mmm_f32_64x1_gen.mmm()), + }; + let model = match *KIND { + Kind::CortexA53 => Some(cortex_a53::model()), + Kind::CortexA55 => Some(cortex_a55::model()), + _ => None, + }; + let impls = ops.mmm_impls.clone(); + ops.mmm_f32 = if let Some(model) = model { + Box::new(move |m, k, n| model.pick(&impls, m, k, n)) + } else { + Box::new(move |_, _, n| { + if n.unwrap_or(8) < 8 { + arm64simd_mmm_f32_16x4_gen.mmm() + } else { + arm64simd_mmm_f32_8x8_gen.mmm() + } + }) + }; + #[cfg(feature = "no_fp16")] + if has_fp16() { + log::warn!( + "This is a build with fp16 disabled, while your platform CPU seems to support it." 
+ ); + } + #[cfg(not(feature = "no_fp16"))] + if has_fp16() { + if *KIND == Kind::CortexA55 { + log::info!("Cortex-A55 mmm_f16 and mmv_f16 activated"); + ops.mmm_f16 = Box::new(|_, _, n| { + use tract_data::internal::DimLike; + if n.unwrap_or(1024).divceil(4) * 4 < n.unwrap_or(1024).divceil(8) * 8 { + arm64fp16_mmm_f16_32x4_a55.mmm() + } else { + arm64fp16_mmm_f16_16x8_a55.mmm() + } + }); + ops.mmv_f16 = Box::new(|_, _| arm64fp16_mmm_f16_128x1_a55.mmm()); + } else { + log::info!("ARMv8.2 mmm_f16 and mmv_f16 activated"); + ops.mmm_f16 = Box::new(|_, _, n| { + use tract_data::internal::DimLike; + if n.unwrap_or(1024).divceil(4) * 4 < n.unwrap_or(1024).divceil(8) * 8 { + arm64fp16_mmm_f16_32x4_gen.mmm() + } else { + arm64fp16_mmm_f16_16x8_gen.mmm() + } + }); + ops.mmv_f16 = Box::new(|_, _| arm64fp16_mmm_f16_128x1_gen.mmm()); + } + } + ops.leaky_relu_f32 = Box::new(|| arm64simd_leaky_relu_f32_8n::ew()); + ops.sigmoid_f32 = Box::new(|| arm64simd_sigmoid_f32_4n::ew()); + ops.tanh_f32 = Box::new(|| arm64simd_tanh_f32_4n::ew()); + ops.max_f32 = Box::new(|| arm64simd_max_f32_16n::red()); + ops.sum_f32 = Box::new(|| arm64simd_sum_f32_16n::red()); + ops.mul_by_scalar_f32 = Box::new(|| arm64simd_mul_by_scalar_f32_16n::ew()); + ops.softmax2_fastcompact_f32 = Box::new(|| arm64simd_softmax2_fastcompact_f32_16n::red()); + #[cfg(not(feature = "no_fp16"))] + if has_fp16() { + log::info!("ARMv8.2 tanh_f16 and sigmoid_f16 activated"); + ops.leaky_relu_f16 = Box::new(|| arm64fp16_leaky_relu_f16_16n::ew()); + ops.tanh_f16 = Box::new(|| arm64fp16_tanh_f16_8n::ew()); + ops.sigmoid_f16 = Box::new(|| arm64fp16_sigmoid_f16_8n::ew()); + ops.max_f16 = Box::new(|| arm64fp16_max_f16_32n::red()); + ops.sum_f16 = Box::new(|| arm64fp16_sum_f16_32n::red()); + ops.mul_by_scalar_f16 = Box::new(|| arm64fp16_mul_by_scalar_f16_32n::ew()); + } else { + log::info!("No native fp16 support"); + } + #[cfg(any(target_os = "macos", all(target_os = "ios", feature = "apple-amx-ios")))] + { + apple_amx::plug(ops); 
+ } +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs b/vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs new file mode 100644 index 000000000..512c65322 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs @@ -0,0 +1,32 @@ +use crate::frame::mmm::ImplementationQuality::ManuallyOptimized; +use crate::mmm::*; +use crate::Ops; +use tract_data::prelude::*; + +use super::has_amx; + +const AMX: fn() -> bool = crate::arm64::has_amx; +const CAN_FUSE: fn(&FusedSpec) -> bool = |f| !matches!(f, &FusedSpec::LeakyRelu(_)); + +MMMExternKernel!(apple_amx_mmm_f32_32x32(32, 32)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized)); +MMMExternKernel!(apple_amx_mmm_f32_32x1(32, 1)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized)); +MMMExternKernel!(apple_amx_mmm_f16_64x32(64, 32)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized)); +MMMExternKernel!(apple_amx_mmm_f16_64x1(64, 1)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized)); + +pub fn plug(ops: &mut Ops) { + if has_amx() { + log::info!("AMX optimisation activated"); + ops.mmm_f16 = Box::new(|_, _, _| apple_amx_mmm_f16_64x32.mmm()); + ops.mmm_f32 = Box::new(|_, _, _| apple_amx_mmm_f32_32x32.mmm()); + ops.mmv_f16 = Box::new(|_, _| apple_amx_mmm_f16_64x1.mmm()); + ops.mmv_f32 = Box::new(|_, _| apple_amx_mmm_f32_32x1.mmm()); + ops.mmm_impls.extend_from_slice(&[ + apple_amx_mmm_f32_32x32.mmm(), + apple_amx_mmm_f32_32x1.mmm(), + apple_amx_mmm_f16_64x32.mmm(), + apple_amx_mmm_f16_64x1.mmm(), + ]); + } else { + log::info!("No AMX optimisation"); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs new file mode 100644 index 000000000..a09df594f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs @@ -0,0 +1,64 @@ +use tract_data::half::f16; + +mod by_scalar; +mod leaky_relu; +mod max; +pub mod panel_extract; +mod sum; +mod unicast; +pub use 
by_scalar::*; +pub use leaky_relu::*; +pub use max::*; +pub use sum::*; +pub use unicast::*; + +use crate::block_quant::PackedBlockQuantFormat; +use crate::block_quant::Q4_0; +use crate::frame::mmm::ImplementationQuality::ManuallyOptimized; +use crate::Ops; + +const FP16: fn() -> bool = crate::arm64::has_fp16; + +MMMExternKernel!(arm64fp16_mmm_f16_16x8_gen(16, 8)@(16, 16) where(FP16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64fp16_mmm_f16_16x8_a55(16, 8)@(16, 16) where(FP16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64fp16_mmm_f16_32x4_gen(32, 4)@(16, 16) where(FP16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64fp16_mmm_f16_32x4_a55(32, 4)@(16, 16) where(FP16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64fp16_mmm_f16_128x1_gen(128,1)@(16, 16) where(FP16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64fp16_mmm_f16_128x1_a55(128,1)@(16, 16) where(FP16) quality(ManuallyOptimized)); + +MMMExternKernel!(arm64fp16_mmm_f16_64x3_gen(64, 3)@(16, 16) where(FP16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64fp16_mmm_f16_32x6_gen(32, 6)@(16, 16) where(FP16) quality(ManuallyOptimized)); + +MMMExternKernel! 
{ arm64fp16_mmm_f16_64x1_gen(64, 1)@(16, 16) where(FP16)
+    packing[1] = q40f16z16se => |k| k.with_packing_a(PackedBlockQuantFormat::new(&Q4_0, 64, 16, true));
+    packing[2] = q40f16z16 => |k| k.with_packing_a(PackedBlockQuantFormat::new(&Q4_0, 64, 16, false));
+    quality(ManuallyOptimized)
+}
+
+pub fn plug(ops: &mut Ops) {
+    panel_extract::plug(ops);
+    ops.mmm_impls.extend_from_slice(&[
+        arm64fp16_mmm_f16_16x8_a55.mmm(),
+        arm64fp16_mmm_f16_16x8_gen.mmm(),
+        arm64fp16_mmm_f16_32x4_a55.mmm(),
+        arm64fp16_mmm_f16_32x4_gen.mmm(),
+        arm64fp16_mmm_f16_128x1_a55.mmm(),
+        arm64fp16_mmm_f16_128x1_gen.mmm(),
+        arm64fp16_mmm_f16_64x3_gen.mmm(),
+        arm64fp16_mmm_f16_32x6_gen.mmm(),
+        arm64fp16_mmm_f16_64x1_gen.mmm(),
+    ]);
+}
+
+tanh_impl!(f16, arm64fp16_tanh_f16_8n, 8, 8, crate::arm64::has_fp16());
+sigmoid_impl!(f16, arm64fp16_sigmoid_f16_8n, 8, 8, crate::arm64::has_fp16());
+
+#[cfg(test)]
+mod test {
+
+    #[test]
+    fn kits() {
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs
new file mode 100644
index 000000000..e791890bd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs
@@ -0,0 +1,258 @@
+use crate::f16;
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_mul_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 32 == 0); // the loop below consumes 32 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // st1 writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, v0.h[0]
+                    2:
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                        fmul v4.8h, v4.8h, v0.8h
+                        fmul v5.8h, v5.8h, v0.8h
+                        fmul v6.8h, v6.8h, v0.8h
+                        fmul v7.8h, v7.8h, v0.8h
+                        st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                in("v0") s.to_bits(),
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_add_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 32 == 0); // the loop below consumes 32 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // st1 writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, v0.h[0]
+                    2:
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                        fadd v4.8h, v4.8h, v0.8h
+                        fadd v5.8h, v5.8h, v0.8h
+                        fadd v6.8h, v6.8h, v0.8h
+                        fadd v7.8h, v7.8h, v0.8h
+                        st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                in("v0") s.to_bits(),
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_sub_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 32 == 0); // the loop below consumes 32 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // st1 writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, v0.h[0]
+                    2:
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                        fsub v4.8h, v4.8h, v0.8h
+                        fsub v5.8h, v5.8h, v0.8h
+                        fsub v6.8h, v6.8h, v0.8h
+                        fsub v7.8h, v7.8h, v0.8h
+                        st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                in("v0") s.to_bits(),
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_subf_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 32 == 0); // the loop below consumes 32 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // st1 writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, v0.h[0]
+                    2:
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                        fsub v4.8h, v0.8h, v4.8h
+                        fsub v5.8h, v0.8h, v5.8h
+                        fsub v6.8h, v0.8h, v6.8h
+                        fsub v7.8h, v0.8h, v7.8h
+                        st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                in("v0") s.to_bits(),
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_min_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 32 == 0); // the loop below consumes 32 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // st1 writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, v0.h[0]
+                    2:
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                        fmin v4.8h, v4.8h, v0.8h
+                        fmin v5.8h, v5.8h, v0.8h
+                        fmin v6.8h, v6.8h, v0.8h
+                        fmin v7.8h, v7.8h, v0.8h
+                        st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                in("v0") s.to_bits(),
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_max_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 32 == 0); // the loop below consumes 32 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // st1 writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, v0.h[0]
+                    2:
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                        fmax v4.8h, v4.8h, v0.8h
+                        fmax v5.8h, v5.8h, v0.8h
+                        fmax v6.8h, v6.8h, v0.8h
+                        fmax v7.8h, v7.8h, v0.8h
+                        st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                in("v0") s.to_bits(),
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+#[cfg(test)]
+mod test_arm64fp16_mul_by_scalar_f16_32n {
+    use super::*;
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_mul_by_scalar_f16_32n,
+        |a, b| a * b
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_add_by_scalar_f16_32n,
+        |a, b| a + b
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_sub_by_scalar_f16_32n,
+        |a, b| a - b
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_subf_by_scalar_f16_32n,
+        |a, b| b - a
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_min_by_scalar_f16_32n,
+        |a, b| a.min(b)
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_max_by_scalar_f16_32n,
+        |a, b| a.max(b)
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs
new file mode 100644
index 000000000..f4f6204aa
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs
@@ -0,0 +1,56 @@
+use tract_data::internal::f16;
+
+ew_impl_wrap!(
+    f16,
+    arm64fp16_leaky_relu_f16_16n,
+    16,
+    8,
+    f16,
+    #[inline(never)]
+    fn run(buf: &mut [f16], alpha: f16) {
+        assert!(buf.len() % 16 == 0); // ldp/stp q3, q4 cover 16 f16 per iteration
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], alpha: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_mut_ptr(); // stp writes back through this pointer
+                std::arch::asm!("
+                    dup v0.8h, {alpha:v}.h[0]
+                    dup v1.8h, {one:v}.h[0]
+                    2:
+                        ldp q3, q4, [{ptr}]
+
+                        fcmgt v5.8h, v3.8h, #0.0
+                        fcmgt v6.8h, v4.8h, #0.0
+                        bsl v5.16b, v1.16b, v0.16b
+                        bsl v6.16b, v1.16b, v0.16b
+                        fmul v3.8h, v3.8h, v5.8h
+                        fmul v4.8h, v4.8h, v6.8h
+
+                        stp q3, q4, [{ptr}], #32
+                        subs {len}, {len}, 16
+                        bne 2b
+                    ",
+                one = in(vreg) f16::from_f32(1.0f32).to_bits(),
+                alpha = in(vreg) alpha.to_bits(),
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                out("v0") _,
+                out("v1") _,
+                out("q3") _,
+                out("q4") _,
+                out("q5") _,
+ out("q6") _, + ); + } + } + unsafe { run(buf, alpha) } + } +); + +#[cfg(test)] +pub mod test_arm64simd_leaky_relu_f16_16n { + use super::*; + leaky_relu_frame_tests!(crate::arm64::has_fp16(), f16, arm64fp16_leaky_relu_f16_16n); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs new file mode 100644 index 000000000..7a7b1033f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs @@ -0,0 +1,63 @@ +use tract_data::half::f16; + +reduce_impl_wrap!( + f16, + arm64fp16_max_f16_32n, + 32, + 8, + (), + f16::MIN, + #[inline(never)] + fn run(buf: &[f16], _: ()) -> f16 { + assert!(buf.len() % 32 == 0); + assert!(buf.len() > 0); + #[target_feature(enable = "fp16")] + unsafe fn run(buf: &[f16]) -> f16 { + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + let mut out: u16; + std::arch::asm!(" + ins v0.h[0], {min:w} + dup v0.8h, v0.h[0] + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[0] + dup v3.8h, v0.h[0] + + 2: + ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64 + fmax v0.8h, v0.8h, v4.8h + fmax v1.8h, v1.8h, v5.8h + fmax v2.8h, v2.8h, v6.8h + fmax v3.8h, v3.8h, v7.8h + + subs {len}, {len}, 32 + bne 2b + + fmax v0.8h, v0.8h, v1.8h + fmax v2.8h, v2.8h, v3.8h + fmax v0.8h, v0.8h, v2.8h + fmaxv h0, v0.8h + ", + // using v0 as inout triggers https://github.com/rust-lang/rust/issues/120374 + min = in(reg) f16::MIN.to_bits(), + ptr = inout(reg) ptr => _, + len = inout(reg) len => _, + out("v0") out, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + f16::from_bits(out) + } + } + unsafe { run(buf) } + }, + #[inline(never)] + fn reduce_two(a: f16, b: f16) -> f16 { + a.max(b) + } +); + +#[cfg(test)] +mod test_arm64fp16_max_f16_32n { + use super::*; + crate::max_frame_tests!(crate::arm64::has_fp16(), f16, arm64fp16_max_f16_32n); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs 
b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs new file mode 100644 index 000000000..a96aec077 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs @@ -0,0 +1,94 @@ +use super::FP16; +use crate::block_quant::{PackedBlockQuantFormat, Q4_0}; +use crate::pack::Packing; +use crate::Ops; +use tract_data::internal::*; + +pub fn plug(ops: &mut Ops) { + ops.panel_extractors.push(packed_64_q40_to_f16.clone()); +} + +panel_extractor!(kernel_packed_64_q40_to_f16 as packed_64_q40_to_f16( + Box::new(PackedBlockQuantFormat::new(&Q4_0, 64, 16, true)), + f16::packing(64).align(16) +) where(FP16)); + +#[target_feature(enable = "fp16")] +unsafe fn kernel_packed_64_q40_to_f16(input: *const u8, output: *mut u8, k: usize) { + unsafe { + if k == 0 { + return; + } + let lookup_table: [u8; 16] = [ + 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc, 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, + 0x46, 0x47, + ]; + std::arch::asm!(" + ld1 {{v13.16b}}, [{lookup_table}] + movi v15.16b, 15 + eor v12.16b, v12.16b, v12.16b + + 2: + add {scales}, {i}, 1024 // scales at end: 32 (cols) * 64 (rows) / 2 (half byte) + ld1 {{v16.16b-v19.16b}}, [{scales}], #64 + ld1 {{v20.16b-v23.16b}}, [{scales}] + + mov {k2}, 32 + 3: + ld1 {{ v9.16b-v10.16b }}, [{i}], #32 + + and v0.16b, v9.16b, v15.16b + ushr v2.16b, v9.16b, 4 + + and v4.16b, v10.16b, v15.16b + ushr v6.16b, v10.16b, 4 + + tbl v0.16b, {{ v13.16b }}, v0.16b + tbl v2.16b, {{ v13.16b }}, v2.16b + tbl v4.16b, {{ v13.16b }}, v4.16b + tbl v6.16b, {{ v13.16b }}, v6.16b + + zip2 v1.16b, v12.16b, v0.16b + zip2 v3.16b, v12.16b, v2.16b + zip2 v5.16b, v12.16b, v4.16b + zip2 v7.16b, v12.16b, v6.16b + + zip1 v0.16b, v12.16b, v0.16b + zip1 v2.16b, v12.16b, v2.16b + zip1 v4.16b, v12.16b, v4.16b + zip1 v6.16b, v12.16b, v6.16b + + fmul v0.8h, v0.8h, v16.8h + fmul v1.8h, v1.8h, v17.8h + fmul v2.8h, v2.8h, v18.8h + fmul v3.8h, v3.8h, v19.8h + fmul v4.8h, v4.8h, v20.8h + fmul v5.8h, v5.8h, v21.8h + fmul v6.8h, v6.8h, v22.8h + fmul 
v7.8h, v7.8h, v23.8h + + st1 {{v0.16b-v3.16b}}, [{o}], #64 + st1 {{v4.16b-v7.16b}}, [{o}], #64 + + subs {k2}, {k2}, #1 + bne 3b + + add {i}, {i}, 128 // skip scales + subs {k}, {k}, 32 + bne 2b + ", + lookup_table = in(reg) &lookup_table, + k = inout(reg) k => _, + k2 = out(reg) _, + scales = out(reg) _, + i = inout(reg) input => _, + o = inout(reg) output => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + ); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs new file mode 100644 index 000000000..e13139c25 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs @@ -0,0 +1,62 @@ +use crate::num_traits::Zero; +use tract_data::half::f16; + +reduce_impl_wrap!( + f16, + arm64fp16_sum_f16_32n, + 32, + 8, + (), + f16::zero(), + #[inline(never)] + fn run(buf: &[f16], _: ()) -> f16 { + assert!(buf.len() % 32 == 0); + assert!(buf.len() > 0); + #[target_feature(enable = "fp16")] + unsafe fn run(buf: &[f16]) -> f16 { + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + let mut out: u16; + std::arch::asm!(" + movi v0.8h, #0 + movi v1.8h, #0 + movi v2.8h, #0 + movi v3.8h, #0 + 2: + ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64 + fadd v0.8h, v0.8h, v4.8h + fadd v1.8h, v1.8h, v5.8h + fadd v2.8h, v2.8h, v6.8h + fadd v3.8h, v3.8h, v7.8h + + subs {len}, {len}, 32 + bne 2b + + fadd v0.8h, v0.8h, v1.8h + fadd v2.8h, v2.8h, v3.8h + fadd v0.8h, v0.8h, v2.8h + faddp v0.8h, v0.8h, v0.8h + faddp v0.8h, v0.8h, v0.8h + faddp v0.8h, v0.8h, v0.8h + ", + ptr = inout(reg) ptr => _, + len = inout(reg) len => _, + out("s0") out, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, 
out("v6") _, out("v7") _,);
+                f16::from_bits(out)
+            }
+        }
+        unsafe { run(buf) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f16, b: f16) -> f16 {
+        a + b
+    }
+);
+
+#[cfg(test)]
+mod test_arm64fp16_sum_f16_32n {
+    use super::*;
+    crate::sum_frame_tests!(crate::arm64::has_fp16(), f16, arm64fp16_sum_f16_32n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs
new file mode 100644
index 000000000..d57ba1062
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs
@@ -0,0 +1,271 @@
+use tract_data::half::f16;
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_mul_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_mut_ptr(); // st1 writes the result back into `a`
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                    2:
+                        ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                        fmul v0.8h, v0.8h, v4.8h
+                        fmul v1.8h, v1.8h, v5.8h
+                        fmul v2.8h, v2.8h, v6.8h
+                        fmul v3.8h, v3.8h, v7.8h
+                        st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                a_ptr = inout(reg) a_ptr => _,
+                b_ptr = inout(reg) b_ptr => _, // v0-v3 hold `a` lanes, v4-v7 hold `b` lanes: all eight are clobbered
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _, out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_add_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_mut_ptr(); // st1 writes the result back into `a`
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                    2:
+                        ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                        fadd v0.8h, v0.8h, v4.8h
+                        fadd v1.8h, v1.8h, v5.8h
+                        fadd v2.8h, v2.8h, v6.8h
+                        fadd v3.8h, v3.8h, v7.8h
+                        st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                a_ptr = inout(reg) a_ptr => _,
+                b_ptr = inout(reg) b_ptr => _, // v0-v3 hold `a` lanes, v4-v7 hold `b` lanes: all eight are clobbered
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _, out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_sub_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_mut_ptr(); // st1 writes the result back into `a`
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                    2:
+                        ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                        fsub v0.8h, v0.8h, v4.8h
+                        fsub v1.8h, v1.8h, v5.8h
+                        fsub v2.8h, v2.8h, v6.8h
+                        fsub v3.8h, v3.8h, v7.8h
+                        st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                a_ptr = inout(reg) a_ptr => _,
+                b_ptr = inout(reg) b_ptr => _, // v0-v3 hold `a` lanes, v4-v7 hold `b` lanes: all eight are clobbered
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _, out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_subf_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_mut_ptr(); // st1 writes the result back into `a`
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                    2:
+                        ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                        fsub v0.8h, v4.8h, v0.8h
+                        fsub v1.8h, v5.8h, v1.8h
+                        fsub v2.8h, v6.8h, v2.8h
+                        fsub v3.8h, v7.8h, v3.8h
+                        st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                a_ptr = inout(reg) a_ptr => _,
+                b_ptr = inout(reg) b_ptr => _, // v0-v3 hold `a` lanes, v4-v7 hold `b` lanes: all eight are clobbered
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _, out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_min_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_mut_ptr(); // st1 writes the result back into `a`
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                    2:
+                        ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                        fmin v0.8h, v0.8h, v4.8h
+                        fmin v1.8h, v1.8h, v5.8h
+                        fmin v2.8h, v2.8h, v6.8h
+                        fmin v3.8h, v3.8h, v7.8h
+                        st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                a_ptr = inout(reg) a_ptr => _,
+                b_ptr = inout(reg) b_ptr => _, // v0-v3 hold `a` lanes, v4-v7 hold `b` lanes: all eight are clobbered
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _, out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_max_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_mut_ptr(); // st1 writes the result back into `a`
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                    2:
+                        ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                        ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                        fmax v0.8h, v0.8h, v4.8h
+                        fmax v1.8h, v1.8h, v5.8h
+                        fmax v2.8h, v2.8h, v6.8h
+                        fmax v3.8h, v3.8h, v7.8h
+                        st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                        subs {len}, {len}, 32
+                        bne 2b
+                    ",
+                len = inout(reg) len => _,
+                a_ptr = inout(reg) a_ptr => _,
+                b_ptr = inout(reg) b_ptr => _, // v0-v3 hold `a` lanes, v4-v7 hold `b` lanes: all eight are clobbered
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _, out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+#[cfg(test)]
+mod
test_arm64fp16_unicast_mul_f16_32n { + use super::*; + use proptest::strategy::Strategy; + crate::unicast_frame_tests!( + crate::arm64::has_fp16(), + f16, + arm64fp16_unicast_mul_f16_32n, + |a, b| a * b + ); + crate::unicast_frame_tests!( + crate::arm64::has_fp16(), + f16, + arm64fp16_unicast_add_f16_32n, + |a, b| a + b + ); + crate::unicast_frame_tests!( + crate::arm64::has_fp16(), + f16, + arm64fp16_unicast_sub_f16_32n, + |a, b| a - b + ); + crate::unicast_frame_tests!( + crate::arm64::has_fp16(), + f16, + arm64fp16_unicast_subf_f16_32n, + |a, b| b - a + ); + crate::unicast_frame_tests!( + crate::arm64::has_fp16(), + f16, + arm64fp16_unicast_min_f16_32n, + |a, b| a.min(b) + ); + crate::unicast_frame_tests!( + crate::arm64::has_fp16(), + f16, + arm64fp16_unicast_max_f16_32n, + |a, b| a.max(b) + ); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs new file mode 100644 index 000000000..be0c505b9 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs @@ -0,0 +1,117 @@ +mod by_scalar; +mod leaky_relu; +mod max; +mod panel_extract; +mod softmax; +mod sum; +mod unicast; + +pub use by_scalar::*; +pub use leaky_relu::arm64simd_leaky_relu_f32_8n; +pub use max::arm64simd_max_f32_16n; +pub use softmax::arm64simd_softmax2_fastcompact_f32_16n; +pub use sum::arm64simd_sum_f32_16n; +pub use unicast::*; + +use crate::block_quant::{PackedBlockQuantFormat, Q4_0}; +use crate::frame::mmm::ImplementationQuality::ManuallyOptimized; +use crate::pack::PackedFormat; +use crate::Ops; + +use super::Kind; + +fn a55() -> isize { + if *super::KIND == Kind::CortexA55 { + 1 + } else { + -1 + } +} + +fn a53() -> isize { + if *super::KIND == Kind::CortexA53 { + 1 + } else { + -1 + } +} + +MMMExternKernel!(arm64simd_mmm_f32_8x8_a55 (8, 8)@(16, 16) quality(ManuallyOptimized) boost(a55)); +MMMExternKernel!(arm64simd_mmm_f32_12x8_a55(12, 8)@(16, 16) quality(ManuallyOptimized) boost(a55)); 
+MMMExternKernel!(arm64simd_mmm_f32_16x4_a55(16, 4)@(16, 16) quality(ManuallyOptimized) boost(a55)); +MMMExternKernel!(arm64simd_mmm_f32_24x4_a55(24, 4)@(16, 16) quality(ManuallyOptimized) boost(a55)); +MMMExternKernel!(arm64simd_mmm_f32_64x1_a55(64, 1)@(16, 16) quality(ManuallyOptimized) boost(a55)); + +MMMExternKernel!(arm64simd_mmm_f32_16x4_a53(16, 4)@(16, 16) quality(ManuallyOptimized) boost(a53)); +MMMExternKernel!(arm64simd_mmm_f32_24x4_a53(24, 4)@(16, 16) quality(ManuallyOptimized) boost(a53)); +MMMExternKernel!(arm64simd_mmm_f32_8x8_a53 (8, 8)@(16, 16) quality(ManuallyOptimized) boost(a53)); +MMMExternKernel!(arm64simd_mmm_f32_12x8_a53(12, 8)@(16, 16) quality(ManuallyOptimized) boost(a53)); +MMMExternKernel!(arm64simd_mmm_f32_64x1_a53(64, 1)@(16, 16) quality(ManuallyOptimized) boost(a53)); + +MMMExternKernel!(arm64simd_mmm_f32_16x4_gen(16, 4)@(16, 16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64simd_mmm_f32_24x4_gen(24, 4)@(16, 16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64simd_mmm_f32_8x8_gen (8, 8)@(16, 16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64simd_mmm_f32_12x8_gen(12, 8)@(16, 16) quality(ManuallyOptimized)); +MMMExternKernel!(arm64simd_mmm_f32_64x1_gen(64, 1)@(16, 16) quality(ManuallyOptimized)); + +fn q40p32z16se() -> PackedBlockQuantFormat { + PackedBlockQuantFormat::new(&Q4_0, 32, 16, true) +} + +MMMExternKernel!(arm64simd_mmm_f32_32x1_gen(32, 1)@(16, 16) + packing[1] = q40f16 => |k| k.with_packing(q40p32z16se(), f16::packing(1)); + packing[2] = q40f32 => |k| k.with_packing(q40p32z16se(), f32::packing(1)); + packing[3] = f16f16 => |k| k.with_packing(f16::packing(32), f16::packing(1)); + packing[4] = f32f16 => |k| k.with_packing(f32::packing(32), f16::packing(1)); + packing[5] = f16f32 => |k| k.with_packing(f16::packing(32), f32::packing(1)); + quality(ManuallyOptimized) + store(f16) +); + +MMMExternKernel!(arm64simd_mmm_f32_32x3_gen(32, 3)@(16, 16) + packing[1] = f32f16 => |k| k.with_packing(f32::packing(32), 
f16::packing(3)); + packing[2] = f16f32 => |k| k.with_packing(f16::packing(32), f32::packing(3)); + packing[3] = f16f16 => |k| k.with_packing(f16::packing(32), f16::packing(3)); + quality(ManuallyOptimized) + store(f16) +); + +MMMExternKernel!(arm64simd_mmm_i32_8x8(8, 8)@(16, 16) + packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 8, 16), PackedFormat::new(DatumType::I8, 8, 16)); + quality(ManuallyOptimized) + store(i8) +); + +MMMExternKernel!(arm64simd_mmm_i32_64x1(64, 1)@(16, 1) + packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 64,16), PackedFormat::new(DatumType::I8, 1, 1)); + quality(ManuallyOptimized) + store(i8) +); + +pub fn plug(ops: &mut Ops) { + ops.mmm_impls.extend([ + arm64simd_mmm_f32_12x8_gen.mmm(), + arm64simd_mmm_f32_12x8_a53.mmm(), + arm64simd_mmm_f32_12x8_a55.mmm(), + arm64simd_mmm_f32_8x8_gen.mmm(), + arm64simd_mmm_f32_8x8_a53.mmm(), + arm64simd_mmm_f32_8x8_a55.mmm(), + arm64simd_mmm_f32_16x4_gen.mmm(), + arm64simd_mmm_f32_16x4_a53.mmm(), + arm64simd_mmm_f32_16x4_a55.mmm(), + arm64simd_mmm_f32_24x4_gen.mmm(), + arm64simd_mmm_f32_24x4_a53.mmm(), + arm64simd_mmm_f32_24x4_a55.mmm(), + arm64simd_mmm_f32_32x1_gen.mmm(), + arm64simd_mmm_f32_32x3_gen.mmm(), + arm64simd_mmm_f32_64x1_gen.mmm(), + arm64simd_mmm_f32_64x1_a53.mmm(), + arm64simd_mmm_f32_64x1_a55.mmm(), + arm64simd_mmm_i32_8x8.mmm(), + arm64simd_mmm_i32_64x1.mmm(), + ]); + panel_extract::plug(ops); +} + +tanh_impl!(f32, arm64simd_tanh_f32_4n, 4, 4, true); +sigmoid_impl!(f32, arm64simd_sigmoid_f32_4n, 4, 4, true); diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs new file mode 100644 index 000000000..49b2c7550 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs @@ -0,0 +1,202 @@ +by_scalar_impl_wrap!( + f32, + arm64simd_mul_by_scalar_f32_16n, + 16, + 4, + f32, + fn run(buf: &mut [f32], s: f32) { + assert!(buf.len() % 16 == 0); + 
assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, v0.s[0] + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}] + fmul v4.4s, v4.4s, v0.4s + fmul v5.4s, v5.4s, v0.4s + fmul v6.4s, v6.4s, v0.4s + fmul v7.4s, v7.4s, v0.4s + st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + in("v0") s, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + } + } +); + +by_scalar_impl_wrap!( + f32, + arm64simd_add_by_scalar_f32_16n, + 16, + 4, + f32, + fn run(buf: &mut [f32], s: f32) { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, v0.s[0] + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}] + fadd v4.4s, v4.4s, v0.4s + fadd v5.4s, v5.4s, v0.4s + fadd v6.4s, v6.4s, v0.4s + fadd v7.4s, v7.4s, v0.4s + st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + in("v0") s, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + } + } +); + +by_scalar_impl_wrap!( + f32, + arm64simd_sub_by_scalar_f32_16n, + 16, + 4, + f32, + fn run(buf: &mut [f32], s: f32) { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, v0.s[0] + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}] + fsub v4.4s, v4.4s, v0.4s + fsub v5.4s, v5.4s, v0.4s + fsub v6.4s, v6.4s, v0.4s + fsub v7.4s, v7.4s, v0.4s + st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + in("v0") s, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + } + } +); + +by_scalar_impl_wrap!( + f32, + arm64simd_subf_by_scalar_f32_16n, + 16, + 4, + f32, + fn run(buf: &mut [f32], s: f32) { + assert!(buf.len() % 16 == 0); 
+ assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, v0.s[0] + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}] + fsub v4.4s, v0.4s, v4.4s + fsub v5.4s, v0.4s, v5.4s + fsub v6.4s, v0.4s, v6.4s + fsub v7.4s, v0.4s, v7.4s + st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + in("v0") s, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + } + } +); + +by_scalar_impl_wrap!( + f32, + arm64simd_min_by_scalar_f32_16n, + 16, + 4, + f32, + fn run(buf: &mut [f32], s: f32) { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, v0.s[0] + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}] + fmin v4.4s, v4.4s, v0.4s + fmin v5.4s, v5.4s, v0.4s + fmin v6.4s, v6.4s, v0.4s + fmin v7.4s, v7.4s, v0.4s + st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + in("v0") s, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + } + } +); + +by_scalar_impl_wrap!( + f32, + arm64simd_max_by_scalar_f32_16n, + 16, + 4, + f32, + fn run(buf: &mut [f32], s: f32) { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, v0.s[0] + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}] + fmax v4.4s, v4.4s, v0.4s + fmax v5.4s, v5.4s, v0.4s + fmax v6.4s, v6.4s, v0.4s + fmax v7.4s, v7.4s, v0.4s + st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + in("v0") s, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + } + } +); + +#[cfg(test)] +mod test_arm64simd_mul_by_scalar_f32_16n { + use super::*; + by_scalar_frame_tests!(true, f32, arm64simd_mul_by_scalar_f32_16n, |a, b| 
a * b); + by_scalar_frame_tests!(true, f32, arm64simd_add_by_scalar_f32_16n, |a, b| a + b); + by_scalar_frame_tests!(true, f32, arm64simd_sub_by_scalar_f32_16n, |a, b| a - b); + by_scalar_frame_tests!(true, f32, arm64simd_subf_by_scalar_f32_16n, |a, b| b - a); + by_scalar_frame_tests!(true, f32, arm64simd_min_by_scalar_f32_16n, |a, b| a.min(b)); + by_scalar_frame_tests!(true, f32, arm64simd_max_by_scalar_f32_16n, |a, b| a.max(b)); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs new file mode 100644 index 000000000..d71666895 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs @@ -0,0 +1,50 @@ +ew_impl_wrap!( + f32, + arm64simd_leaky_relu_f32_8n, + 8, + 4, + f32, + #[inline(never)] + fn run(buf: &mut [f32], alpha: f32) { + assert!(buf.len() % 8 == 0); + assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + std::arch::asm!(" + dup v0.4s, {alpha:v}.s[0] + dup v1.4s, {one:v}.s[0] + 2: + ldp q3, q4, [{ptr}] + + fcmgt v5.4s, v3.4s, #0.0 + fcmgt v6.4s, v4.4s, #0.0 + bsl v5.16b, v1.16b, v0.16b + bsl v6.16b, v1.16b, v0.16b + fmul v3.4s, v3.4s, v5.4s + fmul v4.4s, v4.4s, v6.4s + + stp q3, q4, [{ptr}], #32 + subs {len}, {len}, 8 + bne 2b + ", + one = in(vreg) 1.0f32, + alpha = in(vreg) alpha, + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + out("v0") _, + out("v1") _, + out("q3") _, + out("q4") _, + out("q5") _, + out("q6") _, + ); + } + } +); + +#[cfg(test)] +pub mod test_arm64simd_leaky_relu_f32_8n { + use super::*; + leaky_relu_frame_tests!(true, f32, arm64simd_leaky_relu_f32_8n); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs new file mode 100644 index 000000000..3c32aa7ea --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs @@ -0,0 +1,52 @@ +use std::arch::aarch64::{float32x4_t, vdupq_n_f32, vgetq_lane_f32}; + 
+reduce_impl_wrap!( + f32, + arm64simd_max_f32_16n, + 16, + 4, + (), + f32::MIN, + #[inline(never)] + fn run(buf: &[f32], _: ()) -> f32 { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + let mut out: float32x4_t = vdupq_n_f32(f32::MIN); + std::arch::asm!(" + and v1.16b, v0.16b, v0.16b + and v2.16b, v0.16b, v0.16b + and v3.16b, v0.16b, v0.16b + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + fmax v0.4s, v0.4s, v4.4s + fmax v1.4s, v1.4s, v5.4s + fmax v2.4s, v2.4s, v6.4s + fmax v3.4s, v3.4s, v7.4s + subs {len}, {len}, 16 + bne 2b + fmax v0.4s, v0.4s, v1.4s + fmax v2.4s, v2.4s, v3.4s + fmax v0.4s, v0.4s, v2.4s + fmaxv s0, v0.4s + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + inout("v0") out, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + vgetq_lane_f32(out, 0) + } + }, + #[inline(never)] + fn reduce_two(a: f32, b: f32) -> f32 { + a.max(b) + } +); + +#[cfg(test)] +mod test_arm64simd_max_f32_16n { + use super::*; + crate::max_frame_tests!(true, f32, arm64simd_max_f32_16n); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs new file mode 100644 index 000000000..bbfb22d9f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs @@ -0,0 +1,98 @@ +use crate::pack::Packing; +use crate::Ops; + +pub fn plug(ops: &mut Ops) { + ops.panel_extractors.push(packed_32_q40_to_f32.clone()); +} + +panel_extractor!(kernel_packed_32_q40_to_f32 as packed_32_q40_to_f32( + Box::new(super::q40p32z16se()), + f32::packing(32).align(16) +)); + +unsafe fn kernel_packed_32_q40_to_f32(input: *const u8, output: *mut u8, k: usize) { + unsafe { + if k == 0 { + return; + } + let lookup_table: [u8; 16] = [ + 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc, 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, + 0x46, 0x47, + ]; + std::arch::asm!(" + ld1 
{{v13.16b}}, [{lookup_table}] + movi v15.16b, 15 + eor v12.16b, v12.16b, v12.16b + + 2: + add {scales}, {i}, 512 // scales at end: 32 (cols) * 32 (rows) / 2 (half byte) + ld1 {{v0.8h-v3.8h}}, [{scales}] + + fcvtl v16.4s, v0.4h + fcvtl2 v17.4s, v0.8h + fcvtl v18.4s, v1.4h + fcvtl2 v19.4s, v1.8h + fcvtl v20.4s, v2.4h + fcvtl2 v21.4s, v2.8h + fcvtl v22.4s, v3.4h + fcvtl2 v23.4s, v3.8h + + mov {k2}, 32 + 3: + ld1 {{ v9.16b }}, [{i}], #16 + + and v0.16b, v9.16b, v15.16b + ushr v4.16b, v9.16b, 4 + + tbl v0.16b, {{ v13.16b }}, v0.16b + tbl v4.16b, {{ v13.16b }}, v4.16b + + zip2 v2.16b, v12.16b, v0.16b + zip2 v6.16b, v12.16b, v4.16b + + zip1 v0.16b, v12.16b, v0.16b + zip1 v4.16b, v12.16b, v4.16b + + fcvtl2 v1.4s, v0.8h + fcvtl v0.4s, v0.4h + fcvtl2 v3.4s, v2.8h + fcvtl v2.4s, v2.4h + fcvtl2 v5.4s, v4.8h + fcvtl v4.4s, v4.4h + fcvtl2 v7.4s, v6.8h + fcvtl v6.4s, v6.4h + + fmul v0.4s, v0.4s, v16.4s + fmul v1.4s, v1.4s, v17.4s + fmul v2.4s, v2.4s, v18.4s + fmul v3.4s, v3.4s, v19.4s + fmul v4.4s, v4.4s, v20.4s + fmul v5.4s, v5.4s, v21.4s + fmul v6.4s, v6.4s, v22.4s + fmul v7.4s, v7.4s, v23.4s + + st1 {{v0.16b-v3.16b}}, [{o}], #64 + st1 {{v4.16b-v7.16b}}, [{o}], #64 + + subs {k2}, {k2}, #1 + bne 3b + + add {i}, {i}, 64 // skip scales + subs {k}, {k}, 32 + bne 2b + ", + lookup_table = in(reg) &lookup_table, + k = inout(reg) k => _, + k2 = out(reg) _, + scales = out(reg) _, + i = inout(reg) input => _, + o = inout(reg) output => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + ); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs new file mode 100644 index 000000000..a9cfc162d --- /dev/null +++ 
b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs @@ -0,0 +1,110 @@ +map_reduce_impl_wrap!( + f32, + arm64simd_softmax2_fastcompact_f32_16n, + 16, + 4, + f32, + f32::MIN, + 0f32, + #[inline(never)] + fn run(buf: &mut [f32], max: f32) -> f32 { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + let len = buf.len(); + let ptr = buf.as_ptr(); + let mut acc; + const MLN2: f32 = 0.6931471805f32; + const A: f32 = 8388608.0f32; + const B: f32 = 1065353216.0f32; + const C: f32 = 60801.0f32; + const SLOPE: f32 = A / MLN2; + const OFFSET: f32 = B - C; + unsafe { + std::arch::asm!(" + // v0-v3 sum acc + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + + dup v4.4s, v4.s[0] // max + dup v5.4s, v5.s[0] // slope + dup v6.4s, v6.s[0] // offset + eor v7.16b, v7.16b, v7.16b // zero for max + 2: + ld1 {{v8.4s, v9.4s, v10.4s, v11.4s}}, [{ptr}] + + fsub v8.4s, v8.4s, v4.4s + fsub v9.4s, v9.4s, v4.4s + fsub v10.4s, v10.4s, v4.4s + fsub v11.4s, v11.4s, v4.4s + + fmul v8.4s, v8.4s, v5.4s + fmul v9.4s, v9.4s, v5.4s + fmul v10.4s, v10.4s, v5.4s + fmul v11.4s, v11.4s, v5.4s + + fadd v8.4s, v8.4s, v6.4s + fadd v9.4s, v9.4s, v6.4s + fadd v10.4s, v10.4s, v6.4s + fadd v11.4s, v11.4s, v6.4s + + fmax v8.4s, v8.4s, v7.4s + fmax v9.4s, v9.4s, v7.4s + fmax v10.4s, v10.4s, v7.4s + fmax v11.4s, v11.4s, v7.4s + + fcvtnu v8.4s, v8.4s + fcvtnu v9.4s, v9.4s + fcvtnu v10.4s, v10.4s + fcvtnu v11.4s, v11.4s + + fadd v0.4s, v0.4s, v8.4s + fadd v1.4s, v1.4s, v9.4s + fadd v2.4s, v2.4s, v10.4s + fadd v3.4s, v3.4s, v11.4s + + st1 {{v8.4s, v9.4s, v10.4s, v11.4s}}, [{ptr}], 64 + subs {len}, {len}, 16 + bne 2b + + fadd v0.4s, v0.4s, v1.4s + fadd v2.4s, v2.4s, v3.4s + fadd v0.4s, v0.4s, v2.4s + + ext v1.16b, v0.16b, v0.16b, 4 + ext v2.16b, v0.16b, v0.16b, 8 + ext v3.16b, v0.16b, v0.16b, 12 + fadd v0.4s, v0.4s, v1.4s + fadd v2.4s, v2.4s, v3.4s + fadd v0.4s, v0.4s, v2.4s + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + 
out("v0") acc, + out("v1") _, + out("v2") _, + out("v3") _, + inout("v4") max => _, + inout("v5") SLOPE => _, + inout("v6") OFFSET => _, + out("v7") _, + out("v8") _, + out("v9") _, + out("v10") _, + out("v11") _, + ); + } + acc + }, + #[inline(never)] + fn reduce_two(a: f32, b: f32) -> f32 { + a + b + } +); + +#[cfg(test)] +mod test_arm64simd_softmax2_fastcompact_f32_16n { + use super::*; + crate::softmax_l2_frame_tests!(true, f32, arm64simd_softmax2_fastcompact_f32_16n); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs new file mode 100644 index 000000000..87116a8f1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs @@ -0,0 +1,59 @@ +use crate::num_traits::Zero; + +reduce_impl_wrap!( + f32, + arm64simd_sum_f32_16n, + 16, + 4, + (), + f32::zero(), + #[inline(never)] + fn run(buf: &[f32], _: ()) -> f32 { + assert!(buf.len() % 16 == 0); + assert!(buf.len() > 0); + unsafe fn run(buf: &[f32]) -> f32 { + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + let mut out: u32; + std::arch::asm!(" + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + 2: + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64 + fadd v0.4s, v0.4s, v4.4s + fadd v1.4s, v1.4s, v5.4s + fadd v2.4s, v2.4s, v6.4s + fadd v3.4s, v3.4s, v7.4s + + subs {len}, {len}, 16 + bne 2b + + fadd v0.4s, v0.4s, v1.4s + fadd v2.4s, v2.4s, v3.4s + fadd v0.4s, v0.4s, v2.4s + faddp v0.4s, v0.4s, v0.4s + faddp v0.4s, v0.4s, v0.4s + ", + ptr = inout(reg) ptr => _, + len = inout(reg) len => _, + out("s0") out, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _,); + f32::from_bits(out) + } + } + unsafe { run(buf) } + }, + #[inline(never)] + fn reduce_two(a: f32, b: f32) -> f32 { + a + b + } +); + +#[cfg(test)] +mod test_arm64simd_sum_f32_16n { + use super::*; + crate::sum_frame_tests!(true, f32, arm64simd_sum_f32_16n); +} diff --git 
a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs new file mode 100644 index 000000000..a7a4d4114 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs @@ -0,0 +1,233 @@ +unicast_impl_wrap!( + f32, + arm64simd_unicast_mul_f32_16n, + 16, + 4, + #[inline(never)] + fn run(a: &mut [f32], b: &[f32]) { + assert!(a.len() == b.len()); + assert!(a.len() % 16 == 0); + assert!(a.len() > 0); + unsafe fn run(a: &mut [f32], b: &[f32]) { + unsafe { + let len = a.len(); + let a_ptr = a.as_ptr(); + let b_ptr = b.as_ptr(); + std::arch::asm!(" + 2: + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64 + fmul v0.4s, v0.4s, v4.4s + fmul v1.4s, v1.4s, v5.4s + fmul v2.4s, v2.4s, v6.4s + fmul v3.4s, v3.4s, v7.4s + st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + a_ptr = inout(reg) a_ptr => _, + b_ptr = inout(reg) b_ptr => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _,); + } + } + unsafe { run(a, b) } + } +); + +unicast_impl_wrap!( + f32, + arm64simd_unicast_add_f32_16n, + 16, + 4, + #[inline(never)] + fn run(a: &mut [f32], b: &[f32]) { + assert!(a.len() == b.len()); + assert!(a.len() % 16 == 0); + assert!(a.len() > 0); + unsafe fn run(a: &mut [f32], b: &[f32]) { + unsafe { + let len = a.len(); + let a_ptr = a.as_ptr(); + let b_ptr = b.as_ptr(); + std::arch::asm!(" + 2: + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64 + fadd v0.4s, v0.4s, v4.4s + fadd v1.4s, v1.4s, v5.4s + fadd v2.4s, v2.4s, v6.4s + fadd v3.4s, v3.4s, v7.4s + st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + a_ptr = inout(reg) a_ptr => _, + b_ptr = inout(reg) b_ptr => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _,); + } + } + unsafe { run(a, b) } + } +); + +unicast_impl_wrap!( + 
f32, + arm64simd_unicast_sub_f32_16n, + 16, + 4, + #[inline(never)] + fn run(a: &mut [f32], b: &[f32]) { + assert!(a.len() == b.len()); + assert!(a.len() % 16 == 0); + assert!(a.len() > 0); + unsafe fn run(a: &mut [f32], b: &[f32]) { + unsafe { + let len = a.len(); + let a_ptr = a.as_ptr(); + let b_ptr = b.as_ptr(); + std::arch::asm!(" + 2: + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64 + fsub v0.4s, v0.4s, v4.4s + fsub v1.4s, v1.4s, v5.4s + fsub v2.4s, v2.4s, v6.4s + fsub v3.4s, v3.4s, v7.4s + st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + a_ptr = inout(reg) a_ptr => _, + b_ptr = inout(reg) b_ptr => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _,); + } + } + unsafe { run(a, b) } + } +); + +unicast_impl_wrap!( + f32, + arm64simd_unicast_subf_f32_16n, + 16, + 4, + #[inline(never)] + fn run(a: &mut [f32], b: &[f32]) { + assert!(a.len() == b.len()); + assert!(a.len() % 16 == 0); + assert!(a.len() > 0); + unsafe fn run(a: &mut [f32], b: &[f32]) { + unsafe { + let len = a.len(); + let a_ptr = a.as_ptr(); + let b_ptr = b.as_ptr(); + std::arch::asm!(" + 2: + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64 + fsub v0.4s, v4.4s, v0.4s + fsub v1.4s, v5.4s, v1.4s + fsub v2.4s, v6.4s, v2.4s + fsub v3.4s, v7.4s, v3.4s + st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + a_ptr = inout(reg) a_ptr => _, + b_ptr = inout(reg) b_ptr => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _,); + } + } + unsafe { run(a, b) } + } +); + +unicast_impl_wrap!( + f32, + arm64simd_unicast_max_f32_16n, + 16, + 4, + #[inline(never)] + fn run(a: &mut [f32], b: &[f32]) { + assert!(a.len() == b.len()); + assert!(a.len() % 16 == 0); + assert!(a.len() > 0); + unsafe fn run(a: &mut [f32], b: &[f32]) { + unsafe { + let len = a.len(); + let a_ptr = 
a.as_ptr(); + let b_ptr = b.as_ptr(); + std::arch::asm!(" + 2: + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64 + fmax v0.4s, v0.4s, v4.4s + fmax v1.4s, v1.4s, v5.4s + fmax v2.4s, v2.4s, v6.4s + fmax v3.4s, v3.4s, v7.4s + st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + a_ptr = inout(reg) a_ptr => _, + b_ptr = inout(reg) b_ptr => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _,); + } + } + unsafe { run(a, b) } + } +); + +unicast_impl_wrap!( + f32, + arm64simd_unicast_min_f32_16n, + 16, + 4, + #[inline(never)] + fn run(a: &mut [f32], b: &[f32]) { + assert!(a.len() == b.len()); + assert!(a.len() % 16 == 0); + assert!(a.len() > 0); + unsafe fn run(a: &mut [f32], b: &[f32]) { + unsafe { + let len = a.len(); + let a_ptr = a.as_ptr(); + let b_ptr = b.as_ptr(); + std::arch::asm!(" + 2: + ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}] + ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64 + fmin v0.4s, v0.4s, v4.4s + fmin v1.4s, v1.4s, v5.4s + fmin v2.4s, v2.4s, v6.4s + fmin v3.4s, v3.4s, v7.4s + st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64 + subs {len}, {len}, 16 + bne 2b + ", + len = inout(reg) len => _, + a_ptr = inout(reg) a_ptr => _, + b_ptr = inout(reg) b_ptr => _, + out("v0") _, out("v1") _, out("v2") _, out("v3") _,); + } + } + unsafe { run(a, b) } + } +); + +#[cfg(test)] +mod test_arm64simd_unicast_mul_f32_16n { + use super::*; + use proptest::strategy::Strategy; + crate::unicast_frame_tests!(true, f32, arm64simd_unicast_mul_f32_16n, |a, b| a * b); + crate::unicast_frame_tests!(true, f32, arm64simd_unicast_add_f32_16n, |a, b| a + b); + crate::unicast_frame_tests!(true, f32, arm64simd_unicast_sub_f32_16n, |a, b| a - b); + crate::unicast_frame_tests!(true, f32, arm64simd_unicast_subf_f32_16n, |a, b| b - a); + crate::unicast_frame_tests!(true, f32, arm64simd_unicast_min_f32_16n, |a, b| a.min(b)); + crate::unicast_frame_tests!(true, f32, 
arm64simd_unicast_max_f32_16n, |a, b| a.max(b)); +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs new file mode 100644 index 000000000..5ccf9b689 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs @@ -0,0 +1,16 @@ +use crate::frame::mmm::CostModel; + pub fn model() -> CostModel<'static> { + CostModel { + big_product_mkn_threshold: 4193280.0, + big_product_kernel_choice: "arm64simd_mmm_f32_12x8_a53", + kernels: &["arm64simd_mmm_f32_12x8_a53", "arm64simd_mmm_f32_12x8_gen", "arm64simd_mmm_f32_16x4_a53", "arm64simd_mmm_f32_16x4_gen", "arm64simd_mmm_f32_24x4_a53", "arm64simd_mmm_f32_24x4_gen", "arm64simd_mmm_f32_8x8_a53", "arm64simd_mmm_f32_8x8_gen", "generic_f32_4x4"], + mrs: &[4, 8, 12, 16, 24], + nrs: &[4, 8], + feat_norm_mean: &[4.592185479105843, 4.595318666792368, 4.579484503710355, 13.76698864960861, 1.5094315895372235, 0.7603118712273642, 3.47170523138833, 0.8752515090543259, 5.487801810865191, 0.9224094567404426, 7.414361167002012, 0.9387575452716298, 11.415367203219317, 0.959758551307847, 1.5074195171026157, 0.750125754527163, 3.47170523138833, 0.875125754527163], + feat_norm_stddev: &[1.2629893666668983, 1.2446322895476982, 1.258916587498509, 1.3105293102858375, 1.1063478713873012, 0.4268931127321023, 2.3025561444671223, 0.330433510637837, 3.431728816936762, 0.2675261685447694, 4.624258056138275, 0.23977451171303063, 6.954988241153163, 0.19652499713600946, 1.1207056563030822, 0.4329400731304941, 2.292868878895526, 0.3305762669799629], + w1: &[-0.6321063041687012, 0.24184978008270264, -0.4356610178947449, -0.1422707587480545, 0.10410869866609573, 0.09415467828512192, 0.1568029671907425, -0.25644537806510925, -0.37143954634666443, 0.15696385502815247, 0.050514884293079376, -0.07972156256437302, -0.253411203622818, 0.27587205171585083, 0.02698700875043869, -0.07245094329118729, -0.013899300247430801, 0.022088056430220604, 0.2630922496318817, -0.06870237737894058, 
0.40947580337524414, 0.22110328078269958, 0.03808840364217758, -0.008957616984844208, -0.11127127707004547, 0.07818343490362167, 0.025474127382040024, -0.09513817727565765, 0.10613243281841278, 0.029441041871905327, 0.0819312185049057, -0.03519295156002045, -0.3130439519882202, 0.4705337882041931, 0.4476615786552429, -0.616556704044342, 0.2223544716835022, -0.23584842681884766, -0.3312308192253113, 0.18874213099479675, -0.033394988626241684, 0.09006354957818985, 0.014722823165357113, 0.0877116397023201, 0.07635975629091263, 0.04284617677330971, -0.029695890843868256, -0.05645013228058815, -0.096514992415905, 0.16431200504302979, 0.11922749876976013, -0.08329842984676361, -0.15593503415584564, 0.33497852087020874, 0.5143201947212219, -0.4143742322921753, -0.07121813297271729, 0.032980211079120636, -0.014759342186152935, -0.10575086623430252, -0.08755142986774445, 0.053559254854917526, 0.2959750294685364, -0.210640087723732, -0.09462635219097137, 0.14600691199302673, 0.22388464212417603, -0.185477152466774, -0.100673608481884, -0.10946766287088394, 0.03957876190543175, -0.10485030710697174, 0.01792730763554573, 0.15610192716121674, -0.14726269245147705, 0.30900657176971436, 0.21081387996673584, -0.06592089682817459, 0.03168980032205582, 0.20096036791801453, 0.021350117400288582, -0.04456694424152374, 0.35106319189071655, 0.04561518132686615, -0.14208926260471344, 0.06227286159992218, -0.20092618465423584, 0.08163813501596451, 0.23094142973423004, -0.0332462415099144, 0.26035502552986145, 0.4639679193496704, 0.11891252547502518, 0.4722647964954376, -0.025709064677357674, 0.1651654839515686, -0.009242026135325432, 0.02252785675227642, 0.13325856626033783, -0.32073062658309937, -0.05948975682258606, -0.07114000618457794, -0.04468341916799545, -0.002579547930508852, 0.2056179940700531, -0.14614446461200714, -0.11110267788171768, 0.09043771028518677, 0.135812908411026, -0.3300320506095886, 0.290109783411026, 0.23399846255779266, -0.04882314056158066, 
-7.729629578534514e-05, 0.04754950851202011, 0.003435821272432804, 0.1115187332034111, -0.08208155632019043, 0.018088344484567642, -0.01600349321961403, -0.025757616385817528, 0.060233402997255325, -0.08445348590612411, 0.375010222196579, 0.7828134298324585, -0.836024820804596, 0.041282471269369125, -0.07747451961040497, 0.31279265880584717, -0.05552798509597778, -0.03274049609899521, -0.1147448793053627, -0.1660863310098648, 0.390122652053833, 0.29283249378204346, -0.0705522671341896, -0.2927100956439972, 0.038575850427150726, -0.15336857736110687, -0.028894517570734024, -0.06372164189815521, 0.2578844130039215, 0.060502175241708755, -0.14235782623291016, 0.6358739137649536, -0.2645033001899719, 0.01847453974187374, 0.3809853792190552, 0.0059107388369739056, -0.07365082949399948, -0.17490413784980774, 0.26099810004234314, 0.38216090202331543, -0.44192376732826233, -0.1497800052165985, 0.11983825266361237, 0.05704215168952942, -0.09331715852022171, -0.027353238314390182, 0.07132093608379364, 0.013686291873455048, -0.14973664283752441, -0.6386663317680359, -0.42794787883758545, 0.43632233142852783, -0.022474655881524086, 0.011099671013653278, 0.08784982562065125, 0.046248968690633774, 0.011553826741874218, 0.0328642763197422, 0.08678832650184631, 0.3153251111507416, -0.15444470942020416, -0.5339609980583191, 0.10007581859827042, -0.02821769379079342, -0.3091129660606384, -0.6009559631347656, -0.555920422077179, 0.9594710469245911, -0.5884919166564941, -0.08316593617200851, 0.07074970006942749, 0.026868166401982307, 0.03690064698457718, -0.2468167096376419, 0.20655325055122375, 0.2654767632484436, -0.11032287031412125, 0.09603621065616608, 0.12746618688106537, 0.11097392439842224, -0.046335164457559586, 0.2753968834877014, -0.4040895402431488, -0.20803606510162354, 0.29299837350845337, -0.21050886809825897, -0.02308674342930317, 0.32019543647766113, -0.010012545622885227, -0.07219666987657547, 0.03816547617316246, -0.03670865297317505, -0.023583250120282173, 
-0.2030763179063797, 0.4087490737438202, 0.19682352244853973, -0.061049312353134155, -0.34018784761428833, 0.4121433198451996, -0.10742263495922089, -0.2883375287055969, 0.15564028918743134, -0.014489974826574326, -0.40427249670028687, 0.04029366746544838, -0.46333804726600647, -0.5811125636100769, 0.1686166524887085, -0.08247993886470795, 0.02783152647316456, -0.07444962859153748, -0.11033248156309128, 0.17976728081703186, -0.05866902321577072, -0.037863120436668396, 0.016240332275629044, 0.08362828195095062, 0.04285397008061409, -0.2676204442977905, -0.18113869428634644, 0.10164932906627655, 0.5798585414886475, -0.2936221659183502, -0.16815273463726044, 0.3153108060359955, 0.1320323497056961, 0.29474350810050964, -0.31565147638320923, 0.032277628779411316, 0.5137525796890259, 0.13915763795375824, -0.08313784748315811, 0.0871160700917244, 0.07447603344917297, -0.4863177537918091, 0.022499559447169304, 0.07244526594877243, -0.1484450399875641, -0.08256664127111435, 0.09993510693311691, 0.33980417251586914, -0.5465939044952393, -0.18684262037277222, 0.050183601677417755, 0.015223318710923195, -0.32613685727119446, 0.2532300353050232, 0.21044038236141205, -0.24877160787582397, 0.17659279704093933, -0.14793306589126587, 0.054353710263967514, -0.07312241941690445, 0.04128497466444969, -0.0071349963545799255, -0.17010675370693207, 0.3045605719089508, -0.391606867313385, 0.19206605851650238, 0.10403380542993546, -0.3808597922325134, -0.016270365566015244, -0.09313700348138809, 0.11184006929397583, 0.01242944784462452, -0.03349926695227623, -0.1107369139790535, 0.2315940409898758, 0.03170541673898697, -0.48357459902763367, 0.21056240797042847, -0.25072887539863586, 0.3221265375614166, 0.5108669400215149, -0.6159838438034058, -0.5540208220481873, 0.38405123353004456, 0.1323588639497757, -0.11752784997224808, 0.07821227610111237, 0.0494898185133934, 0.28607267141342163, -0.45723024010658264, -0.5914809703826904, -0.15741930902004242, -0.09551641345024109, 
-0.769051730632782, -0.2119017094373703, -0.8505933284759521, 0.025818098336458206, 0.11196669936180115, 0.013385393656790257, -0.02640729956328869, -0.061663247644901276, -0.012524818070232868, -0.8237857222557068, -0.40553018450737, -0.06807617098093033, -0.07508324831724167, -0.011943532153964043, 0.07591933757066727, 0.18625806272029877, -0.14417743682861328, 0.0031204342376440763, -0.031199704855680466, -0.037418268620967865, -0.062444642186164856, 0.0434197299182415, -0.12462416291236877, -0.256317675113678, -0.0023087849840521812, 0.20042477548122406, 0.17625926434993744, -0.21970611810684204, 0.1626158505678177, -0.09550918638706207, -0.10577445477247238, -0.17239737510681152, 0.28190216422080994, 0.003485368099063635, -0.24596424400806427, 0.5330491662025452, -0.6179713010787964, -0.19186368584632874, 0.04049135372042656, 0.005797799210995436, 0.10468537360429764, -0.03522713482379913, 0.2554764151573181, -0.6601210832595825, 0.3554987609386444, -0.1528356373310089, -0.2578294575214386, -0.01912580616772175, 0.14837700128555298, 0.28032413125038147, 0.6525465250015259, -0.16390740871429443, -0.12456659972667694, -0.04434182122349739, 0.44120529294013977, -0.06832294911146164, 0.4077378511428833, -0.07938709110021591, 0.23457404971122742, -0.05966708064079285, 0.09640492498874664, 0.7555295825004578, -0.3110663592815399, 0.035311225801706314, 0.25391876697540283, 0.09088675677776337, 0.03320888802409172, -0.1745719611644745, 0.2270633578300476, 0.2851920425891876, -0.07204318791627884, -0.05483328923583031, 0.189837247133255, -0.15304607152938843, -0.08311894536018372, -0.06649994850158691, -0.0776129737496376, 0.11864881962537766, -0.06670717149972916, -0.00406235596165061, -0.6984686255455017, 0.28291743993759155, -0.04160117730498314, -0.09169034659862518, 0.14924104511737823, 0.46138641238212585, -0.29699283838272095, -0.6411864757537842, 0.26037612557411194, 0.21487018465995789, -0.20806393027305603, -0.4174681007862091, 0.1901395320892334, 
0.049021925777196884, 0.2822348475456238, -0.03862098604440689, 0.029824024066329002, 0.2657202184200287, -0.43108099699020386, 0.37041717767715454, -0.025845345109701157, -0.09200481325387955, -0.017871620133519173, 0.281535267829895, -0.20838744938373566, -0.400356650352478, 0.4133286476135254, -0.08745774626731873, 0.02171195112168789, 0.4766440987586975, -0.24629971385002136, 0.2504408657550812, -0.5850875973701477, -0.49699774384498596, 0.7086884379386902, -0.479250967502594, 0.6140879392623901, 0.0023341099731624126, -0.06628652662038803, -0.0873338133096695, -0.2862805724143982, 0.28077220916748047, 0.030578527599573135, -0.281633198261261, -0.7042887806892395, -0.03409203886985779, 0.3272986114025116, 0.3397904634475708, -0.7069221138954163, 0.09408266842365265, -0.05243761092424393, -0.20503726601600647, 0.15679042041301727, 0.4723545014858246, -0.39158886671066284, 0.17581138014793396, 0.10779093205928802, -0.013951681554317474, 0.052481986582279205, -0.36543500423431396, 0.29497984051704407, 0.4044850766658783, -0.3766767382621765, -0.07298431545495987, 0.9660398364067078, 0.27753373980522156, -0.11616200953722, 0.05277060344815254, -0.05379771068692207, 0.026094499975442886, -0.011136082001030445, -0.13593854010105133, 0.033518679440021515, 0.6947338581085205, 0.6335914134979248, -0.06526267528533936, 0.019844267517328262, 0.10042254626750946, -0.16847042739391327, -0.15717101097106934, -0.7462965250015259, -0.0653005987405777, 0.057602036744356155, 0.010834889486432076, -0.46870648860931396, -0.1872870922088623, 0.3152116537094116, 0.0731910765171051, -0.13902369141578674, 0.10666802525520325, 0.3094567656517029, -0.926356315612793, -0.38388797640800476, -0.02191060781478882, -0.005548040382564068, -0.20935170352458954, 0.24779647588729858, 0.12304577976465225, -0.2883053123950958, 0.019766222685575485, -0.029659172520041466, 0.06051887571811676, -0.01741836965084076, 0.04409812018275261, 0.011840295046567917, -0.14320705831050873, 0.31673386693000793, 
-0.069312185049057, -0.00935965683311224, 0.019028477370738983, -0.1078404039144516, -0.12472966313362122, 0.10027194768190384, 0.31244829297065735, -0.10855710506439209, -0.3165830969810486, 0.4076120853424072, 0.05742274224758148, 0.17263729870319366, 0.3141464293003082, -0.13655878603458405, 0.07613589614629745, -0.10808823257684708, -0.19837258756160736, 0.16735948622226715, 0.055960867553949356, 0.005388774909079075, -0.30227115750312805, -0.009724846109747887, -0.11610261350870132, 0.05133519321680069, -0.029441826045513153, 0.06810834258794785, -0.13311177492141724, 0.2196519374847412, 0.19138571619987488, -0.2621391713619232, 0.11996466666460037, -0.05961257219314575, 0.1763487011194229, -0.10918399691581726, -0.14629563689231873, 0.5217060446739197, -0.0012722538085654378, 0.08564157783985138, -0.6640400290489197, -0.41702714562416077, 0.045037489384412766, -0.059789709746837616, -0.05092751979827881, 0.10446680337190628, -0.05335049331188202, 0.0846114456653595, 0.04981796815991402, -0.14310699701309204, 0.01863306201994419, -0.0474325567483902, 0.23124581575393677, -0.6166588068008423, -0.7533295154571533, -1.1133880615234375, -0.1241607666015625, -0.5540894865989685, 0.2806711494922638, -0.4259497821331024, -0.07380827516317368, 0.009988346137106419, 0.3110937178134918, 0.0072226757183671, 0.2422133982181549, -0.351376473903656, -0.5103139877319336, 0.5470908284187317, -0.14952707290649414, -0.005531645845621824, -0.24725599586963654, 0.1639375537633896, 0.07172811776399612, -0.1566568911075592, 0.32833099365234375, 0.06875353306531906, -0.17773276567459106, -0.09706790000200272, -0.019849322736263275, 0.1257631778717041, 0.02103520557284355, 0.12721672654151917, 0.012451020069420338, 0.039879027754068375, 0.17779605090618134, -0.09887054562568665, -0.08146625012159348, 0.05893132835626602, 0.18479469418525696, -0.2479601502418518, -0.26928654313087463, 0.3720027506351471, -0.45930227637290955, 0.3673400282859802, 0.016545426100492477, 
0.13507097959518433, -0.006458526011556387, 0.036685895174741745, 0.309455007314682, -0.23917894065380096, -0.11758854985237122, 0.2146540731191635, -0.11578961461782455, 0.006646907888352871, -0.04229713976383209, 0.09812270104885101, 0.06730903685092926, 0.28935620188713074, -0.02212020941078663, 0.007341589778661728, -0.1257125288248062, -0.4639318287372589, 0.41743314266204834, 0.40524497628211975, -0.20389464497566223, 0.1286880075931549, 0.05365758389234543, -0.14487741887569427, 0.1511518359184265, 0.11219878494739532, 0.13080842792987823, -0.175934836268425, -0.08939457684755325, 0.16476190090179443, -0.061722587794065475, 0.15382836759090424, 0.15293729305267334, -0.23814627528190613, -0.778872013092041, 0.2813372313976288, 0.20388194918632507, -0.34535032510757446, -0.014981378801167011, 0.1560390293598175, 0.534339189529419, 0.7075706124305725, -0.20866382122039795, 0.050050001591444016, -0.030285198241472244, 0.430580735206604, 0.06858251988887787, 0.32321590185165405, 0.006104054860770702, 0.11919829249382019, -0.09377042204141617, -0.028785547241568565, 0.489607572555542, -0.321664422750473, 0.020770607516169548, 0.5259214639663696, -0.0682888925075531, 0.10569659620523453, -0.18257132172584534, 0.2565872073173523, 0.2177353799343109, 0.029641704633831978, 0.0678875744342804, 0.1679811030626297, -0.04851052165031433, -0.1633165180683136, -0.007416700944304466, -0.06638842821121216, 0.06177712231874466, -0.0709109827876091, -0.11213518679141998, -0.20582593977451324, 0.7092531323432922, 0.43438467383384705, -0.0060964771546423435, -0.12442151457071304, -0.008676152676343918, 0.21390584111213684, -0.014475004747509956, -0.7601429224014282, 0.15622451901435852, -0.3261253833770752, 0.005610095337033272, -0.5111817121505737, -0.003055301494896412, 0.32741662859916687, -0.022710084915161133, -0.24255472421646118, -0.6487520933151245, 0.08797790110111237, 0.2754897177219391, -0.2213398665189743, -0.17206217348575592, 0.1177680641412735, 0.16599608957767487, 
-0.19922694563865662, -0.07098120450973511, -0.1628963202238083, 0.03356413170695305, -0.24303652346134186, -0.2067747414112091, 0.1192406490445137, -0.020932691171765327, 0.07735628634691238, 0.24762177467346191, -0.3007707893848419, -0.43011191487312317, -0.07597793638706207, 0.2528873085975647, -0.3795652985572815, 0.14651291072368622, 0.07552091032266617, 0.026706784963607788, -0.11118876934051514, 0.0460294634103775, 0.4268769323825836, 0.32645294070243835, -0.09493713080883026, 0.18892213702201843, 0.17980137467384338, 0.06521839648485184, 0.03702569752931595, 0.05443478748202324, -0.030978504568338394, -0.11806164681911469, -0.20229215919971466, 0.6260767579078674, 0.6068219542503357, -0.060956377536058426, 0.05200914293527603, 0.04499080404639244, -0.09300816804170609, 0.0501115508377552, 0.9676806926727295, -0.12394528090953827, 0.17313909530639648, -0.0274575874209404, 1.0245190858840942, -0.24425312876701355, 0.3827340602874756, 0.270155131816864, -0.7169324159622192], + b1: &[-0.518636167049408, 0.7074531316757202, -0.4965735971927643, 0.6063699126243591, -0.3258720934391022, 0.4608336389064789, 0.8324258327484131, -0.6118353605270386, 0.8226121664047241, 0.3534131944179535, -0.43312883377075195, -0.05448569357395172, -0.5826212167739868, 0.8478071689605713, 0.23062080144882202, -0.30911386013031006, -0.5776869058609009, 0.5107449293136597, 0.18762148916721344, 0.2889731228351593, -0.5579098463058472, 0.7818499207496643, 0.7910265922546387, -0.4228874444961548, 0.6197248697280884, -0.4563252627849579, 0.27223169803619385, -0.2859383523464203, -0.4862801730632782, -0.7853735089302063, -0.1534343808889389, -0.5592636466026306, -0.6364999413490295, -0.5210756063461304, 0.3506944477558136, -0.5348182916641235, -0.5098673105239868, 0.45690369606018066, -0.3907462954521179, 0.8493368029594421], + w2: &[-0.525189995765686, 0.44041961431503296, -0.4107511341571808, 0.3741440176963806, -0.02630656771361828, 0.27733951807022095, 0.3907228410243988, 
-0.05409616604447365, 0.3991526663303375, 0.24264170229434967, -0.657869279384613, -0.3758363425731659, -0.5133534669876099, 0.3480457663536072, 0.5088834166526794, 0.0942729115486145, -0.4167974889278412, 0.4895906448364258, 0.17553496360778809, 0.3702719211578369, -0.5372111201286316, -0.1560969352722168, -0.30670106410980225, -0.48799967765808105, 0.4005548357963562, -0.3075137138366699, 0.656658947467804, -0.4914362132549286, -0.36532747745513916, -0.5505443811416626, 0.1328023225069046, -0.3564044237136841, -0.467242956161499, -0.3465808629989624, 0.4501214027404785, -0.4742763936519623, -0.35285890102386475, 0.46182748675346375, -0.28942185640335083, 0.2825036346912384, -0.1725425124168396, -0.17012473940849304, 0.5306965708732605, -0.34125325083732605, 0.21301832795143127, -0.49370092153549194, -0.06135714799165726, 0.5665233135223389, -0.01510544028133154, -0.0015591675182804465, 0.4308379292488098, 0.09525317698717117, 0.06129995733499527, -0.06124228611588478, -0.28377535939216614, -0.038286369293928146, 0.19221894443035126, -0.45041826367378235, -0.4307488799095154, -0.30516454577445984, 0.3670405447483063, -0.1779327690601349, -0.36808863282203674, 0.344722718000412, -0.2691067159175873, 0.5803861021995544, -0.42112261056900024, 0.1169033870100975, 0.35742461681365967, 0.16161565482616425, 0.44920068979263306, 0.2572435438632965, 0.263318806886673, 0.7236857414245605, -0.2759736180305481, 0.37376394867897034, 0.37350600957870483, -0.4067005515098572, 0.18588955700397491, -0.4281120300292969, 0.4204690456390381, -0.448592871427536, 0.11808016151189804, -0.4660882353782654, 0.33337321877479553, -0.11569353938102722, -0.589764416217804, -0.17854063212871552, -0.44001755118370056, 0.7101057767868042, 0.057653751224279404, 0.3937684893608093, 0.257487416267395, -0.38924211263656616, 0.08511713892221451, 0.10950952023267746, 0.0917661041021347, -0.25429144501686096, 0.6342174410820007, -0.15891794860363007, -0.021509289741516113, 0.535305380821228, 
0.28721731901168823, -0.32432296872138977, -0.26846611499786377, 0.07051636278629303, -0.12710770964622498, 0.14568471908569336, 0.6293584704399109, 0.4198862612247467, -0.8883509039878845, 0.5271400809288025, 0.17345309257507324, 0.1771862506866455, -0.214192733168602, 0.17817191779613495, 0.44757506251335144, 0.04112042486667633, 0.6819244027137756, -0.7277362942695618, 0.19224950671195984, -0.2905896008014679, 0.5791959762573242, -0.4898945093154907, 0.47323065996170044, -0.40173205733299255, -0.36294564604759216, 0.6861273050308228, -0.2955973744392395, -0.19740070402622223, 0.4044080674648285, -0.11244003474712372, 0.58234703540802, -0.31175708770751953, -0.3454722762107849, 0.12274620682001114, 0.29693669080734253, -0.41234102845191956, -0.1583351045846939, -0.2763107419013977, 0.34174609184265137, -0.7301539182662964, -0.4137580394744873, 0.5135444402694702, -0.19664454460144043, 0.3913029730319977, -0.47720086574554443, 0.2519521415233612, 0.3860025703907013, 0.4073657691478729, 0.06604084372520447, 0.32879960536956787, 0.4341438114643097, 0.4072171449661255, -0.3755425810813904, 0.29250237345695496, 0.4723772704601288, -0.39177075028419495, 0.3535446524620056, -0.5977760553359985, -0.11535356938838959, -0.8606860637664795, 0.3202466070652008, 0.534551203250885, -0.10786011070013046, 0.5766461491584778, -1.0034655332565308, -0.08353354036808014, 0.20165663957595825, -0.8530645370483398, 0.2801732122898102, -0.2713226079940796, 0.460101842880249, 0.5550602078437805, 0.11862986534833908, -0.8431587219238281, -0.41269758343696594, -0.36862486600875854, 0.08385410159826279, 0.1634000688791275, -0.22930988669395447, -0.39085301756858826, 0.8845512270927429, 0.2522968053817749, 0.3779301643371582, 0.3454946279525757, -0.14984408020973206, 0.2937467098236084, 0.3651972711086273, 1.1317671537399292, -0.4535387456417084, 0.07272656261920929, -0.29987066984176636, -0.03405649587512016, 0.1012202724814415, -0.12492970377206802, -0.048626113682985306, 
-0.3150321841239929, -0.4124220013618469, -0.7775830030441284, 0.25562793016433716, -0.4026365876197815, 0.27681317925453186, -0.3169574439525604, 0.414761483669281, -0.37095436453819275, -0.2815983295440674, 0.6821384429931641, -0.23631460964679718, -0.391885370016098, 0.32081300020217896, 0.029309673234820366, 0.3151959478855133, -0.23872429132461548, -0.2680605947971344, 0.2245175689458847, 0.28024742007255554, -0.5187304615974426, -0.17155316472053528, -0.18662460148334503, 0.44196388125419617, -0.7731465697288513, -0.39956656098365784, 0.4926709830760956, -0.2705640196800232, 0.5851831436157227, -0.28655296564102173, 0.21914565563201904, 0.42291808128356934, 0.3754308521747589, 0.12476411461830139, 0.4564429223537445, 0.41455739736557007, 0.24721866846084595, -0.39062193036079407, 0.47335484623908997, 0.4390261769294739, -0.2776612639427185, 0.36352279782295227, -0.4658246338367462, 0.5458199977874756, 0.2368425875902176, -0.28375834226608276, -0.21349868178367615, -0.12575705349445343, -0.314109742641449, 0.2133757472038269, -0.4604170322418213, -0.5457999110221863, 0.347943514585495, 0.3864844739437103, 0.2128392457962036, 0.06274894624948502, -0.5941122174263, -0.4954967200756073, 0.3897503614425659, 0.6681548953056335, 0.011607992462813854, -0.5754616260528564, -0.4551040530204773, 0.14332124590873718, 0.5475043058395386, 0.35485684871673584, 0.516143798828125, -0.43508225679397583, -0.2927212119102478, -0.38220953941345215, 0.22585861384868622, -0.49666696786880493, -0.47814127802848816, 0.6455125212669373, -0.4184291362762451, 0.5714888572692871, -0.06349734216928482, -0.337534636259079, 0.08359762281179428, -0.6663680672645569, -0.05490731820464134, 0.27789443731307983, 0.44944822788238525, -0.12919825315475464, -0.24064187705516815, 0.3863179683685303, -0.21315856277942657, -0.010893935337662697, -0.49465489387512207, -0.1953386515378952, 0.4405977129936218, -0.362499862909317, -0.15224213898181915, 0.503758430480957, 0.13674911856651306, 
0.24574719369411469, -0.2888658046722412, -0.5966756939888, 0.24279867112636566, 0.43060633540153503, -0.2950061857700348, -0.3071616590023041, -0.31878525018692017, 0.5719135999679565, -0.46542906761169434, -0.33102989196777344, 0.2584391236305237, -0.3341030776500702, 0.35185420513153076, -0.5347702503204346, 0.2021929919719696, 0.3747906982898712, 0.3017856478691101, 0.4192887842655182, 0.2290816456079483, 0.26369208097457886, 0.30613088607788086, -0.2766033113002777, 0.48649486899375916, 0.28767234086990356, -0.31826111674308777, 0.47518086433410645, -0.2643313407897949, 0.38674306869506836, -0.20252466201782227, 0.2426745593547821, -0.2963939607143402, 0.35027387738227844, -0.40756842494010925, -0.17158618569374084, 0.6504075527191162, -0.23639068007469177, -0.5520732998847961, 0.34597641229629517, 0.12782879173755646, 0.46479496359825134, -0.4128115773200989, -0.4125882685184479, 0.20131008327007294, 0.4997844099998474, -0.21766024827957153, -0.2570849657058716, -0.1471637338399887, 0.5070111155509949, -0.6722937226295471, -0.5443961024284363, 0.5341878533363342, -0.29976886510849, 0.6135430932044983, -0.3595261573791504, 0.49033448100090027, 0.3653552234172821, 0.2656362056732178, 0.10900922119617462, 0.4813465476036072, 0.41922783851623535, 0.2692069411277771, -0.4056242108345032, 0.33006641268730164, 0.27100467681884766, -0.5306692123413086, 0.2701503336429596, -0.6044796705245972], + b2: &[0.044342152774333954, -0.28361865878105164, -0.0350283607840538, -0.129508376121521, -0.006770995445549488, -0.24053514003753662, 0.3617520332336426, -0.3381704092025757, -0.24953331053256989], + } + } diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs new file mode 100644 index 000000000..8156850af --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs @@ -0,0 +1,16 @@ +use crate::frame::mmm::CostModel; + pub fn model() -> CostModel<'static> { + CostModel { + big_product_mkn_threshold: 
263214080.0, + big_product_kernel_choice: "arm64simd_mmm_f32_12x8_a55", + kernels: &["arm64simd_mmm_f32_12x8_a53", "arm64simd_mmm_f32_12x8_a55", "arm64simd_mmm_f32_12x8_gen", "arm64simd_mmm_f32_16x4_a53", "arm64simd_mmm_f32_16x4_a55", "arm64simd_mmm_f32_16x4_gen", "arm64simd_mmm_f32_24x4_a53", "arm64simd_mmm_f32_24x4_a55", "arm64simd_mmm_f32_24x4_gen", "arm64simd_mmm_f32_8x8_a53", "arm64simd_mmm_f32_8x8_a55", "arm64simd_mmm_f32_8x8_gen", "generic_f32_4x4"], + mrs: &[4, 8, 12, 16, 24], + nrs: &[4, 8], + feat_norm_mean: &[5.27886946965165, 6.250454700699139, 5.241114620514529, 16.770438790865423, 1.540625, 0.770625, 3.518125, 0.8775, 5.560625, 0.923125, 7.453125, 0.943125, 11.613125, 0.9575, 1.509375, 0.771875, 3.581875, 0.898125], + feat_norm_stddev: &[0.9509890252368828, 0.6930410342704738, 1.0261938261805659, 1.600617293156687, 1.0981118382819681, 0.42043086158725473, 2.338198341538852, 0.3278623949159141, 3.494112850120205, 0.26639300736881577, 4.56457037785321, 0.2316036147710134, 7.043415558830455, 0.20172691937369452, 1.0925484471523377, 0.4196236222795381, 2.273113830052299, 0.30248385804039707], + w1: &[-0.13682155311107635, -0.1783919334411621, 0.26539096236228943, 0.19552235305309296, -0.10618806630373001, 0.13501706719398499, 0.21776071190834045, 0.08390733599662781, -0.2215081751346588, 0.18829140067100525, -0.20535176992416382, -0.0463368222117424, 0.05815611779689789, -0.13855215907096863, -0.024539709091186523, -0.4855460524559021, -0.414151668548584, -0.7574286460876465, 0.8987273573875427, 0.5316352844238281, 0.8244147896766663, 0.8388808369636536, -0.02545193023979664, 0.04357631504535675, -0.007071307860314846, 0.18223997950553894, -0.04292978346347809, 0.004330582916736603, -0.013073648326098919, -0.04028080403804779, -0.09901119023561478, 0.062175191938877106, -0.006247916258871555, 0.009531030431389809, 0.09731218218803406, 0.004297865089029074, 0.6260067224502563, -0.10042139887809753, 0.807989239692688, 0.6866835951805115, 
-0.018399836495518684, -0.07194910198450089, -0.18889868259429932, -0.07729395478963852, -0.03907148540019989, 0.017019111663103104, 0.06159460172057152, -0.02395886555314064, 0.23730705678462982, -0.15546496212482452, -0.04492897167801857, -0.003982013091444969, -0.09160511195659637, 0.03185845538973808, -0.27577653527259827, 0.5699247121810913, -0.6027079820632935, -0.4136800467967987, -0.04364140331745148, -0.11226192861795425, 0.16899903118610382, -0.11524038016796112, 0.12308179587125778, 0.027925828471779823, -0.06269104778766632, 0.11644940823316574, -0.369202196598053, 0.3338239789009094, -0.06509242206811905, 0.2303273230791092, 0.018171854317188263, -0.08709719777107239, 0.18228614330291748, -0.071574367582798, -0.012407014146447182, -0.284942090511322, -0.1326635330915451, -0.08634418249130249, 0.11018547415733337, -0.09423547983169556, 0.13697068393230438, -0.03515861555933952, 0.014629656448960304, -0.21159854531288147, 0.15693463385105133, -0.021487032994627953, 0.032396819442510605, 0.028369005769491196, 0.08724819868803024, -0.13204769790172577, 0.4691336452960968, 0.1237262561917305, -0.06020978465676308, 0.24037614464759827, 0.05237792432308197, -0.10641840100288391, 0.1820996105670929, 0.6079273819923401, -0.4903985857963562, 0.40744978189468384, 0.43370547890663147, 0.5092437863349915, 0.0810965895652771, 0.4670366048812866, -0.11692337691783905, -0.013550599105656147, -0.364605575799942, 0.34470170736312866, -0.01755279302597046, 0.30621764063835144, 0.35784396529197693, 0.42736300826072693, 0.022546129301190376, 0.08497388660907745, 0.07601173967123032, 0.0696730837225914, -0.21918217837810516, 0.6236687898635864, -0.0793512761592865, -0.0668395534157753, 0.010559543035924435, 0.46621084213256836, -0.2632196843624115, -0.03991322219371796, 0.1392994076013565, 0.003188274335116148, -0.2655166983604431, -0.22143644094467163, -0.19157607853412628, -0.39395904541015625, -0.021266555413603783, 0.08848410844802856, 0.08152330666780472, 
0.013220606371760368, -0.18424198031425476, 0.05234640836715698, 0.05919161066412926, -0.16255362331867218, -0.04549096152186394, 0.044437166303396225, 0.21704396605491638, -0.5149197578430176, -0.6047705411911011, -0.8048356175422668, -0.36901935935020447, -0.19035962224006653, 1.3252514600753784, 0.19824109971523285, 0.07630860805511475, -0.011165079660713673, 0.011559495702385902, 0.10554458945989609, -0.12820255756378174, 0.29352235794067383, -0.06662449240684509, -0.15792854130268097, 0.0345480814576149, -0.04881494492292404, -0.06912268698215485, -0.00013739474525209516, -0.1597173660993576, 0.34323570132255554, 0.34446775913238525, 0.3795395493507385, 0.017453864216804504, 0.1102253794670105, 0.04026523232460022, 0.1107630804181099, 0.10295291990041733, 0.5326219201087952, 0.1749747395515442, -0.2661803066730499, 0.11752097308635712, 0.08010037988424301, -0.5501991510391235, -0.059987347573041916, -0.04125819355249405, 0.16356444358825684, 0.020170046016573906, -0.08306766301393509, 0.17777052521705627, 0.4687126874923706, 0.7723219394683838, 0.7309747934341431, -0.019829215481877327, 0.10945341736078262, 0.06796073168516159, -0.12042505294084549, -0.26762208342552185, 0.10846878588199615, 0.013867417350411415, 0.01077105849981308, -0.10193657130002975, -0.1757654845714569, -0.245382159948349, 0.20442898571491241, 0.10115985572338104, 0.2514199912548065, -0.3793720304965973, -0.6926521062850952, -0.6686761975288391, -0.607191264629364, 0.16187654435634613, -0.0073340232484042645, -0.09948350489139557, -0.21431320905685425, -0.12334707379341125, -0.15290899574756622, -0.026063116267323494, 0.26553207635879517, 0.18921764194965363, -0.1665697544813156, -0.00264778733253479, 0.20274107158184052, 0.3660823404788971, -0.35731416940689087, 0.50246661901474, 0.2781502604484558, 0.1629776805639267, -0.03493829071521759, -0.16012291610240936, -0.08139592409133911, -0.1440155804157257, 0.32721832394599915, 0.1312151998281479, 0.17874418199062347, 0.06143738701939583, 
-0.05158458650112152, 0.4802771806716919, -0.6857288479804993, 0.08245638012886047, -0.09577414393424988, -0.12872998416423798, 0.16155612468719482, 0.24089869856834412, 0.44030725955963135, -0.30994167923927307, 0.12139064073562622, 0.029418930411338806, -0.051672156900167465, -0.10080718994140625, -0.007311842869967222, -0.15189751982688904, -0.1559375822544098, 0.2731820344924927, -0.03627878054976463, 0.10538394004106522, 0.15048423409461975, 0.12981411814689636, 0.0002639668236952275, 0.05666665732860565, 0.08173252642154694, -0.16131722927093506, -0.043261025100946426, -0.14845971763134003, -0.29335740208625793, 0.039398159831762314, -0.02791670151054859, 0.22897064685821533, -0.12178067117929459, -0.4062419831752777, 0.3934949040412903, -0.05093907564878464, -0.06126153841614723, -0.07318481802940369, -0.08793392032384872, -0.01818496361374855, -0.24753189086914062, -0.30580347776412964, 0.44876909255981445, 0.5379880666732788, 0.11587893962860107, 0.2174995243549347, -0.035063862800598145, -0.0010147193679586053, -0.12281838059425354, -0.21301835775375366, 0.3645245432853699, 0.39920729398727417, -0.45564430952072144, 0.03503882512450218, 0.6949061155319214, -0.5742982625961304, 0.38680514693260193, -0.018345845863223076, 0.04529440030455589, -0.04468340799212456, -0.020917288959026337, 0.2523670792579651, -0.4574699103832245, 0.17178472876548767, -0.12147565186023712, 0.043810319155454636, -0.17998050153255463, -0.09663069248199463, -0.03498067706823349, 0.06111514940857887, -0.11410824209451675, 0.18208050727844238, -0.09109053015708923, 0.08489643037319183, 0.15014725923538208, 0.18506401777267456, -0.060843177139759064, -0.11932594329118729, 0.11290943622589111, -0.23226700723171234, -0.2114422470331192, -0.36001038551330566, -0.29864072799682617, -0.05599717050790787, -0.21294310688972473, -0.1301364004611969, -0.4993196725845337, 0.097460076212883, 0.030209479853510857, 0.35134217143058777, -0.9156147837638855, 0.0173207875341177, -0.9142565131187439, 
0.13512593507766724, -0.1926516443490982, -0.2812888026237488, 0.04805266484618187, 0.5790673494338989, -0.28300249576568604, -0.10372477024793625, 0.2964925169944763, 0.16425621509552002, -0.25588271021842957, 0.37744808197021484, -0.07827199995517731, -0.7785226702690125, -0.4873232841491699, -0.0240982286632061, -0.31732890009880066, -0.7271391749382019, -0.40648236870765686, -0.08706668019294739, -0.0876365602016449, -0.08107846975326538, 0.049622420221567154, 0.5049374103546143, -0.09109669923782349, -0.2958216369152069, 0.23400314152240753, 0.0727144181728363, -0.06163109838962555, -0.3235352635383606, -0.08323507010936737, 0.06926267594099045, 0.12505480647087097, 0.06806384027004242, -0.1783592253923416, -0.09036792814731598, 0.007250780239701271, 0.07478834688663483, 0.37752634286880493, 0.10522382706403732, -0.3126020133495331, -0.339804470539093, -0.2922729253768921, -0.04612985998392105, 0.06431944668292999, 0.08483731746673584, 0.12883307039737701, -0.015924949198961258, 0.10468991845846176, -0.3394957184791565, 0.23376204073429108, -0.22720825672149658, 0.005506275221705437, -0.22926953434944153, -0.10148110240697861, 0.06526672840118408, -0.2586720287799835, -0.32853958010673523, 0.3440588712692261, -0.11197478324174881, -0.24647162854671478, 0.32472386956214905, 0.18955329060554504, 0.22783295810222626, 0.27004650235176086, 0.06792190670967102, -0.25404539704322815, -0.0421239472925663, 0.19141103327274323, -0.1919824779033661, 0.024490466341376305, -0.45774775743484497, 0.15080632269382477, -0.21607035398483276, -0.15506379306316376, -0.4421549439430237, -0.3747740089893341, -0.40712970495224, -0.01002188865095377, -0.18514835834503174, -0.052659012377262115, -0.009491002187132835, -0.04560127854347229, 0.5816720724105835, -0.8684999942779541, -0.6074734330177307, -0.6023196578025818, 0.09026342630386353, -0.8521136045455933, -0.677777886390686, -0.7927519083023071, 0.05012498050928116, 0.006620208732783794, 0.09600439667701721, 
0.006934305187314749, -0.41822823882102966, 0.5416979193687439, 1.3451576232910156, 0.6131516098976135, -0.1447380781173706, 0.09429032355546951, 0.06888633966445923, 0.09988542646169662, -0.09572823345661163, 0.09141702950000763, 0.05828794091939926, -0.20784544944763184, -0.14200495183467865, 0.014049896970391273, -0.081334687769413, 0.15918458998203278, 0.001768372836522758, 0.009856577031314373, 0.5256384611129761, 0.49961280822753906, 0.5969673991203308, 0.37020817399024963, -0.07463415712118149, -0.0038648881018161774, 0.014317997731268406, 0.07256675511598587, 0.27220791578292847, -0.14287996292114258, -0.18170645833015442, -0.021593274548649788, -0.15909305214881897, 0.3259168863296509, -0.11064229905605316, 0.12034989148378372, 0.36166661977767944, -0.21680544316768646, -0.14505243301391602, -0.24518895149230957, -0.054052721709012985, 0.11477477848529816, 0.10946492105722427, -0.004644579254090786, -0.11873581260442734, 0.00934956781566143, 0.026955196633934975, -0.0947655513882637, -0.0432097427546978, 0.2264525443315506, 0.4585563540458679, -0.2117093950510025, 0.06864829361438751, 0.01817937195301056, -0.09130346775054932, -0.031736359000205994, -0.6623827219009399, 0.07924489676952362, 0.30316102504730225, 0.06474705785512924, 0.12052184343338013, -0.06878554821014404, 0.048135798424482346, 0.14442582428455353, -0.1945008486509323, 0.16308918595314026, 0.13180820643901825, -0.3005691170692444, -0.08318639546632767, -0.0371159091591835, -0.036223117262125015, 0.27411049604415894, -0.008904200047254562, -0.21584218740463257, -0.22458405792713165, -0.2840893864631653, 0.9380438327789307, -0.026274412870407104, -0.03674294427037239, -0.039288733154535294, 0.20259428024291992, -0.2627299726009369, -0.03588804602622986, -0.09061996638774872, 0.0026293552946299314, -1.1599351167678833, -0.0888570249080658, 0.3020864427089691, 0.10419020056724548, -0.2301473766565323, -0.2372182309627533, 0.255910724401474, -0.9108321666717529, -0.17266617715358734, 
-0.21715109050273895, -0.4768790900707245, 0.02349638193845749, 0.06996935606002808, 0.2306048572063446, -0.2647320032119751, -0.5029106140136719, 0.18124276399612427, 0.05404527485370636, -0.556660532951355, -0.20282964408397675, 0.1787903904914856, -0.13809867203235626, 0.012665750458836555, -0.007909105159342289, -0.11666542291641235, 0.192016139626503, 0.20280246436595917, 0.04091315343976021, 0.21129484474658966, 0.06015581637620926, -0.1396055370569229, 0.11048803478479385, -0.22130873799324036, 0.10175041109323502, 0.15478093922138214, -0.06699641793966293, 0.16655825078487396, -0.5767931938171387, 0.23376262187957764, -0.06561370939016342, 0.08572515100240707, 0.22690269351005554, -0.10714394599199295, 0.2328615039587021, 0.06609856337308884, 0.15064586699008942, 0.1398843675851822, 9.159173350781202e-05, -0.006412057671695948, 0.1231503039598465, 0.2868848741054535, -0.37850138545036316, -0.4390513002872467, -0.10716433078050613, -0.16492293775081635, -0.17774488031864166, -0.006263014394789934, -0.15535981953144073, -0.15121980011463165, -0.022719506174325943, -0.3260766863822937, 0.1365034133195877, 0.7772430777549744, 0.8306354880332947, 0.8039601445198059, 0.16534824669361115, -0.03939266875386238, -0.15611104667186737, 0.21217003464698792, -0.022034769877791405, -0.025939559563994408, 0.1058378517627716, -0.08505864441394806, 0.08503950387239456, -0.0037705348804593086, -0.0026697057764977217, 0.3492349088191986, 0.15157155692577362, -0.3159380555152893, -0.10824967920780182, -0.04872310906648636, 0.19715555012226105, -0.2658633291721344, -0.06968845427036285, 0.009916169568896294, 0.18593478202819824, -0.038871243596076965, -0.3416462540626526, 0.1855567842721939, 0.21629339456558228, -0.10832708328962326, -0.04190235957503319, 0.2388715296983719, -0.11624565720558167, -0.10361404716968536, 0.0536813959479332, 0.12528158724308014, -0.262010782957077, 0.05081893876194954, 0.29551735520362854, 0.05958620831370354, -0.01989975944161415, 
-0.19261345267295837, 0.01736867055296898, -0.07923264801502228, -0.4404444694519043, 0.3125889301300049, 0.10095971822738647, 0.17173698544502258, 0.23782190680503845, -0.07170403748750687, 0.013639729470014572, 0.19007621705532074, 0.1901141107082367, -0.052342064678668976, -0.9643150568008423, -0.12307217717170715, -0.21010802686214447, -0.5640560984611511, 0.010125457309186459, 0.1314179003238678, 0.10721258819103241, -0.24371789395809174, -0.5925355553627014, 0.49424877762794495, -0.03528435528278351, -0.21386614441871643, 1.4134130477905273, -0.2751445770263672, 0.007012579124420881, -0.023824317380785942, 0.004113825503736734, -0.06332013010978699, 0.286077082157135, 0.04896686226129532, 0.31404414772987366, 0.15028351545333862, 0.003490754636004567, 0.0802399218082428, -0.230818971991539, -0.022719932720065117, 0.26083019375801086, -0.2885863184928894, 0.07537354528903961, 0.12282905727624893, -0.38638314604759216, 0.1752759963274002, -0.07370781153440475, 0.13994526863098145, 0.13313405215740204, 0.2851952016353607, 0.905279278755188, 0.34521353244781494, -0.36453402042388916, 0.46360254287719727, -0.002040385501459241, -0.003476516343653202, -0.19058215618133545, 0.27096763253211975, 0.08722586184740067, 0.03202880546450615, -0.06164764240384102, 0.011678489856421947, 0.21189850568771362, -0.40100231766700745, 0.022941868752241135, 0.0394427627325058, 0.0675845518708229, -0.22503064572811127, 0.14730903506278992, 0.24842065572738647, -0.34360530972480774, 0.21811245381832123, -0.05238509923219681, -0.008763357996940613, -0.1336073875427246, 0.15671105682849884, 0.4475333094596863, -0.5187726616859436, 0.005388418212532997, -0.07889139652252197, 0.10729073733091354, 0.22381159663200378, 0.07434546202421188, -0.0843898206949234, 0.13574494421482086, 0.01853088103234768, -0.41072791814804077, 0.40448933839797974, -0.8231801986694336, -0.4780847728252411, -0.11237931996583939, 0.012673617340624332, -0.04672158136963844, -0.23933981359004974, 
0.01667654886841774, -0.14681674540042877, 0.077765092253685, 0.15309257805347443, 0.03254099190235138, -0.015896232798695564, -0.029608771204948425, -0.288953959941864, -0.32651081681251526, 0.06307528167963028, -0.09873636066913605, 0.08938323706388474, 0.27018269896507263, 0.018129458650946617, -0.050469521433115005, -0.17951229214668274, 0.02319747768342495, 0.06737810373306274, 0.2690926194190979, -0.10778623819351196, -0.04740763455629349, 0.30407941341400146, -0.08746829628944397, -0.2184152454137802, 0.14826175570487976, 0.18092381954193115, 0.07989493757486343, -0.1297195851802826], + b1: &[-0.5191351175308228, 0.6662623882293701, 0.610133707523346, -1.1585999727249146, 0.6903770565986633, 0.4241520166397095, 0.754120945930481, -0.7599878907203674, -0.3445088267326355, 0.9317805767059326, -0.2041703462600708, 0.17219330370426178, 1.1566059589385986, -0.41121166944503784, -0.6977726817131042, 0.7911778092384338, 0.6611397862434387, -0.6938921213150024, -0.03742314130067825, -0.16022440791130066, 0.11257349699735641, 0.07743008434772491, -0.6286312937736511, 0.544836699962616, -0.15634237229824066, -0.5572881698608398, 0.9681645035743713, -0.7440500855445862, 0.10288882255554199, 0.9043763875961304, 0.14654643833637238, -0.024421239271759987, -0.4609592854976654, 0.917902410030365, 0.2704138457775116, 0.6341348886489868, 0.034945350140333176, 0.5565919876098633, 0.1746397614479065, -0.6341800093650818], + w2: &[0.07229708135128021, 0.2507615387439728, 0.16330942511558533, 0.5204483866691589, 0.24313874542713165, -0.5474504232406616, -0.28332123160362244, -0.2225571572780609, -0.1043124571442604, 0.06595291197299957, 0.21239061653614044, -0.14725270867347717, -0.8134568333625793, 0.07381946593523026, -0.24956485629081726, 0.4919748604297638, 0.2962062954902649, 0.3260444402694702, 0.07504145801067352, -0.053836897015571594, 0.2531750500202179, -0.04855559393763542, -0.5578967332839966, -0.5225025415420532, 0.055111128836870193, -0.21510563790798187, 
0.5871708989143372, -0.19132649898529053, 0.007392226252704859, -0.298953115940094, 0.16707110404968262, -0.04706822335720062, 0.07302752882242203, -0.08172990381717682, 0.23955324292182922, -0.15824700891971588, -0.3977665305137634, 0.5267415642738342, -0.11258449405431747, -0.3343915045261383, 0.23245088756084442, -0.7491211891174316, -0.6333310604095459, 0.0232061930000782, -0.2315434217453003, -0.3745144307613373, -0.03209906071424484, -0.4041699469089508, 0.041345734149217606, 0.19181972742080688, -0.2760458290576935, -0.07779327034950256, 0.24569696187973022, -0.18802686035633087, -0.6544056534767151, 0.556419849395752, 0.11468080431222916, -0.32528090476989746, 0.38538315892219543, 0.33702555298805237, -0.442532479763031, 0.00750756124034524, -0.45737770199775696, -0.06860284507274628, -0.4411284625530243, -0.23914210498332977, 0.06834587454795837, 0.14571186900138855, 0.6887655258178711, 0.5702284574508667, 0.3135473430156708, -0.3360161781311035, -0.5353860259056091, 0.06292688101530075, 0.735708475112915, 0.7143703103065491, -0.3693147897720337, 0.525284469127655, 0.39448651671409607, -0.09941494464874268, 0.09564384818077087, 0.5881519913673401, 0.05619557946920395, 0.4508857727050781, -0.2834583520889282, -0.16902177035808563, 0.24799591302871704, -0.182522252202034, 0.0468696765601635, 0.14808374643325806, -0.013205822557210922, -0.12705814838409424, 0.0614711195230484, 0.14103399217128754, -0.2599405348300934, 0.028414186090230942, -0.2865449786186218, -0.08163938671350479, 0.13120926916599274, 0.17990124225616455, -0.16350798308849335, -0.09809352457523346, -0.013590727932751179, -0.17736633121967316, 0.05107983574271202, 0.3411618173122406, -0.2772451341152191, 0.32397109270095825, 0.046551186591386795, 0.13246433436870575, 0.05053735896945, 0.24057962000370026, -0.04693610221147537, -0.1650579869747162, 0.1331019252538681, 0.09457181394100189, -0.16547952592372894, -0.09469929337501526, 0.30049434304237366, 0.12664170563220978, 
-0.013082812540233135, 0.390655517578125, 0.6400918364524841, -0.0010483618825674057, -0.03533017635345459, 0.16345657408237457, 0.05697643384337425, 0.1748565286397934, 0.0036667422391474247, -0.05557025969028473, 0.016822226345539093, -0.12541711330413818, -0.4695605933666229, 0.008447905071079731, 0.16371716558933258, -0.1481284201145172, -0.10916673392057419, 0.1754710078239441, -0.05557332932949066, 0.17406205832958221, 0.03734235838055611, -0.0014076621737331152, 0.16409075260162354, -0.0339696928858757, 0.11525241285562515, 0.11995170265436172, -0.39020177721977234, 0.01936984248459339, -0.14390763640403748, -0.18344464898109436, -0.08675119280815125, 0.19569827616214752, 0.48439380526542664, -0.232485830783844, -0.004231136757880449, 0.15202505886554718, 0.01103641465306282, -0.1192987710237503, -0.17487019300460815, 0.27336806058883667, -0.5894135236740112, -0.03331466019153595, 0.21942859888076782, 0.30420297384262085, 0.2666693329811096, 0.4481956958770752, -0.020630693063139915, 0.8494743704795837, 0.5691520571708679, 0.5711295008659363, 0.00404204148799181, 0.5070351958274841, 0.09074786305427551, 0.15874768793582916, 0.7676622271537781, 0.6556511521339417, 0.1220490038394928, 0.7263025641441345, -0.07173441350460052, 0.14413252472877502, 0.49090006947517395, -0.3324028253555298, 0.45898303389549255, 0.5931536555290222, 0.19021296501159668, -0.7473744750022888, -0.834629476070404, -0.1385311633348465, -0.05174582824110985, 0.018871335312724113, -0.42817312479019165, 0.20682017505168915, 0.016382897272706032, -0.6684255599975586, 0.3525462746620178, -0.42306870222091675, -0.0817568302154541, 0.3572525084018707, -0.23954586684703827, -0.4869120717048645, 0.016070470213890076, 0.5639761686325073, 0.17797298729419708, 0.2919785678386688, -0.3837592601776123, 0.13362792134284973, 0.09925093501806259, 0.12642522156238556, 0.09690988808870316, -0.08732952922582626, 0.24605968594551086, -0.3894798457622528, -0.174991175532341, 0.2573908269405365, 
0.22514064610004425, -0.24535547196865082, -0.2993263006210327, 0.24350187182426453, 0.03375721350312233, 0.16244018077850342, -0.16753582656383514, -0.08621060848236084, 0.1272309273481369, 0.007472787983715534, 0.20557984709739685, 0.1578531116247177, -0.5838948488235474, 0.08410368114709854, -0.2831973135471344, -0.28126293420791626, -0.08023717254400253, 0.5180243849754333, 0.2208152413368225, -0.3613019585609436, -0.06204051896929741, -0.13526616990566254, 0.09384715557098389, -0.27185022830963135, -0.05938927084207535, 0.284194678068161, 0.04228530079126358, 0.5006632208824158, 0.6578063368797302, -0.07014274597167969, -0.3233219087123871, -0.01618030108511448, 0.2888641357421875, -0.08185673505067825, -0.17689819633960724, -0.2994365096092224, 0.016244128346443176, 0.02359011210501194, 0.1367129534482956, -0.01653127372264862, -0.09157261997461319, -0.3516620397567749, -0.09030301123857498, -0.07817772775888443, 0.17603041231632233, -0.01393663790076971, -0.029468189924955368, -0.0814921036362648, -0.12077502906322479, -0.10759524255990982, -0.0750858411192894, 0.2511105239391327, -0.20753242075443268, -0.05136517807841301, -0.024205535650253296, -0.3384825587272644, 0.020664114505052567, 0.11200296878814697, 0.08333364874124527, -0.24177855253219604, -0.07010341435670853, 0.020779477432370186, -0.20839253067970276, -0.0016562794335186481, 0.023504814133048058, 0.3570723235607147, -0.30022287368774414, -0.3554439842700958, -0.027536675333976746, -1.1282703876495361, -0.08706718683242798, 0.0742080882191658, 0.18080361187458038, -0.02274167723953724, -0.704075813293457, -0.9722687602043152, 0.1188407614827156, -0.029379399493336678, 0.8019110560417175, -0.34810709953308105, 0.04902748018503189, -0.7494327425956726, 0.5064789056777954, -0.11681736260652542, 0.2257058471441269, -0.4354608356952667, 0.3252757489681244, -0.1591869592666626, -0.5933760404586792, -0.5259361863136292, 0.22252318263053894, 0.30712220072746277, 0.29186123609542847, 
-0.7899709343910217, 0.3455640971660614, -0.8577526807785034, 0.19282177090644836, 0.29095181822776794, -0.3287593424320221, 0.0454283282160759, -0.5983009338378906, -0.08342050760984421, -0.8976981043815613, 0.10165920853614807, 0.13396088778972626, 0.2290259599685669, 0.02499830722808838, 0.7539560794830322, 0.1477266401052475, 0.3097168207168579, -0.3993585705757141, 0.0817292109131813, 0.038499560207128525, 0.048502497375011444, 0.10572300106287003, -0.17650842666625977, 0.30300378799438477, -0.3586488962173462, -0.09699319303035736, 0.28980425000190735, 0.1152607873082161, -0.30993735790252686, -0.3226162791252136, 0.2082981914281845, 0.08206543326377869, 0.09643732011318207, -0.09098457545042038, -0.09191355854272842, 0.04240717366337776, -0.08706614375114441, 0.3119218051433563, 0.24132680892944336, -0.5137639045715332, 0.03463784605264664, -0.29585450887680054, -0.3583862781524658, -0.09919128566980362, 0.5263358950614929, 0.19875890016555786, -0.4007430374622345, -0.044145308434963226, -0.24342355132102966, 0.16471655666828156, -0.25901785492897034, 0.012997856363654137, 0.3298455476760864, -0.23130790889263153, 0.4484388828277588, 0.35633817315101624, 0.26454973220825195, 0.15214529633522034, -0.12443697452545166, -0.405061811208725, 0.17236965894699097, -0.36522531509399414, -0.074102483689785, 0.09564346820116043, -0.26696014404296875, -0.7053405046463013, -0.4750596880912781, 0.2850874066352844, -0.42413032054901123, 0.3273111581802368, 0.013779409229755402, -0.7248923182487488, -0.49210208654403687, 0.5041399002075195, -0.14308881759643555, 0.629442036151886, -0.8470776677131653, 0.36798736453056335, -0.17092065513134003, 0.5437707304954529, -0.26034078001976013, -0.4502609074115753, 0.2898317873477936, -0.3266198933124542, 0.1681036651134491, 0.6064534783363342, 0.48974573612213135, -0.3461318910121918, -0.36192092299461365, 0.3675844371318817, -0.731248676776886, -0.21227769553661346, -0.4246974289417267, 0.17397946119308472, -0.3643985986709595, 
0.205714613199234, 0.629838228225708, 0.10543780773878098, 0.010421440936625004, 0.6487590670585632, -0.685522198677063, 0.010746597312390804, 0.371294766664505, -0.68584144115448, 0.69797283411026, -0.39890381693840027, 0.2957388460636139, 0.10036955028772354, -0.31620606780052185, -0.5876231789588928, -0.5783882737159729, -0.4745366871356964, 0.20689401030540466, -0.2748165428638458, 0.34110450744628906, 0.817054033279419, 0.8686729073524475, -0.6139298677444458, -0.19506172835826874, -0.03448706120252609, 0.635860025882721, -0.38243091106414795, 0.8843176960945129, 0.08922040462493896, -0.8030375242233276, 0.01003911904990673, 0.49227485060691833, 0.02043282799422741, -0.1812848448753357, 0.8425045609474182, -0.18937410414218903, 0.2360723465681076, -0.0486280657351017, 0.1306903064250946, 0.44811540842056274, -0.09772484004497528, 0.3676001727581024, -0.10864408314228058, 0.10239739716053009, 0.26535993814468384, -0.19465096294879913, -0.05268852412700653, 0.013907784596085548, 0.11859709769487381, -0.008244873955845833, -0.12678827345371246, 0.16795198619365692, 0.09826375544071198, -0.13783332705497742, -0.32474759221076965, -0.018496913835406303, -0.12179988622665405, 0.22411927580833435, -0.10514824092388153, 0.038778163492679596, 0.33486974239349365, 0.31644245982170105, 0.05365574359893799, 0.24912847578525543, -0.31889432668685913, 0.24240325391292572, -0.19231560826301575, 0.18558776378631592, -0.022984078153967857, 0.11608095467090607, 0.15418484807014465, -0.14139854907989502, 0.01758008636534214, -0.12027571350336075, 0.2522386610507965, -0.2922046184539795, 0.049236513674259186, 0.19894357025623322, 0.39957553148269653, 0.3346879780292511, 0.3187335133552551, 0.4501717686653137, -0.8946970701217651, 0.18189306557178497, -0.08766483515501022, 0.2782788574695587, 0.3587392270565033, -0.33824455738067627, 0.6033147573471069, -0.6243746876716614, -0.6177958250045776, 0.6629742383956909, 0.4856598377227783, -0.3099081814289093, -0.678487241268158, 
0.47894829511642456, -0.03139176964759827, 0.16848357021808624, -0.5739434957504272, -0.16708984971046448, 0.11146949231624603, 0.090438611805439, 0.4812713861465454, 0.5129365921020508, -0.7324693202972412, 0.26365718245506287, -0.4824923276901245, -0.5487518310546875, -0.20128659904003143, 0.5759150385856628, 0.3504473567008972, -0.36605504155158997, -0.4257725477218628, -0.25298258662223816, 0.512897789478302, -0.4181336462497711, -0.516604483127594, 0.37244912981987], + b2: &[0.14859354496002197, -0.018167857080698013, -0.3407953083515167, -0.14991576969623566, 0.4018653333187103, -0.2384500652551651, -0.4047893285751343, 0.15702210366725922, -0.3152092695236206, 0.29297566413879395, 0.26403820514678955, -0.2573520541191101, -0.11290331929922104], + } + } diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs new file mode 100644 index 000000000..6819f4da0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs @@ -0,0 +1,4 @@ +use crate::frame::mmm::cost_model::CostModel; +pub fn models() -> Vec<(&'static str, CostModel<'static>)> { + vec![] +} diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs new file mode 100644 index 000000000..6819f4da0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs @@ -0,0 +1,4 @@ +use crate::frame::mmm::cost_model::CostModel; +pub fn models() -> Vec<(&'static str, CostModel<'static>)> { + vec![] +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs new file mode 100644 index 000000000..1158880a6 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs @@ -0,0 +1,65 @@ +use byteorder::{ReadBytesExt, WriteBytesExt, LE}; +use std::io::{Cursor, Read, Write}; +use tract_data::internal::*; + +pub struct NibbleReader { + second_half: Option, + reader: R, +} + +impl<'s> 
NibbleReader> { + pub fn for_slice(slice: &'s [u8]) -> Self { + NibbleReader::new(Cursor::new(slice)) + } +} + +impl NibbleReader { + pub fn new(reader: R) -> NibbleReader { + NibbleReader { reader, second_half: None } + } + + pub fn read_f16(&mut self) -> f16 { + assert!(self.second_half.is_none()); + f16::from_bits(self.reader.read_u16::().unwrap()) + } + + pub fn read_i4(&mut self) -> i8 { + if let Some(second) = self.second_half.take() { + second + } else { + let byte = self.reader.read_u8().unwrap(); + self.second_half = Some((byte >> 4) as i8); + (byte & 0x0F) as i8 + } + } +} + +pub struct NibbleWriter { + first_half: Option, + writer: W, +} + +impl<'s> NibbleWriter> { + pub fn for_slice(slice: &'s mut [u8]) -> Self { + NibbleWriter::new(Cursor::new(slice)) + } +} + +impl NibbleWriter { + pub fn new(writer: W) -> NibbleWriter { + NibbleWriter { writer, first_half: None } + } + + pub fn write_f16(&mut self, f: f16) { + assert!(self.first_half.is_none()); + self.writer.write_u16::(f.to_bits()).unwrap() + } + + pub fn write_i4(&mut self, q: i8) { + if let Some(first) = self.first_half.take() { + self.writer.write_u8(first as u8 | ((q as u8) << 4)).unwrap() + } else { + self.first_half = Some(q); + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs new file mode 100644 index 000000000..28a8fe456 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs @@ -0,0 +1,327 @@ +use downcast_rs::{impl_downcast, Downcast}; +use dyn_clone::{clone_box, DynClone}; +use dyn_hash::DynHash; +use num_traits::Zero; +use tract_data::internal::*; +use tract_data::itertools::Itertools; + +use std::alloc::Layout; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; +use std::hash::Hash; +use std::sync::Arc; + +mod helpers; +mod q4_0; +mod value; + +pub use helpers::{NibbleReader, NibbleWriter}; +pub use q4_0::Q4_0; +pub use value::{BlockQuantFact, BlockQuantValue, 
PackedBlockQuantFact}; + +use crate::mmm::{EagerPackedInput, MMMInputFormat}; +use crate::pack::PackedFormat; + +use crate::WeightType; + +use super::mmm::MMMInputValue; + +pub trait BlockQuant: Debug + Display + Send + Sync + DynClone + DynHash + Downcast { + fn same_as(&self, other: &dyn BlockQuant) -> bool; + + fn block_len(&self) -> usize; + + fn block_bytes(&self) -> usize; + + fn dequant_block_f32(&self, quant: &[u8], block: &mut [f32]); + fn dequant_block_f16(&self, quant: &[u8], block: &mut [f16]); + fn quant_block_f16(&self, block: &[f16], quant: &mut [u8]); + fn quant_block_f32(&self, block: &[f32], quant: &mut [u8]); + + fn quant_f16(&self, input: &[f16]) -> TractResult { + unsafe { + let blocks = input.len() / self.block_len(); + let mut quant = Blob::for_layout( + Layout::from_size_align(blocks * self.block_bytes(), 128).unwrap(), + ); + for b in 0..blocks { + let block = &input[b * self.block_len()..][..self.block_len()]; + let qblock = &mut quant[b * self.block_bytes()..][..self.block_bytes()]; + self.quant_block_f16(block, qblock); + } + Ok(quant) + } + } + + fn quant_f32(&self, input: &[f32]) -> TractResult { + unsafe { + let blocks = input.len() / self.block_len(); + let mut quant = Blob::for_layout( + Layout::from_size_align(blocks * self.block_bytes(), 128).unwrap(), + ); + for b in 0..blocks { + let block = &input[b * self.block_len()..][..self.block_len()]; + let qblock = &mut quant[b * self.block_bytes()..][..self.block_bytes()]; + self.quant_block_f32(block, qblock); + } + Ok(quant) + } + } + + fn dequant_f32(&self, input: &[u8]) -> TractResult { + unsafe { + let blocks = input.len() / self.block_bytes(); + let mut tensor = Tensor::uninitialized::(&[blocks * self.block_len()])?; + let slice = tensor.as_slice_mut::()?; + for b in 0..blocks { + let block = &mut slice[b * self.block_len()..][..self.block_len()]; + let qblock = &input[b * self.block_bytes()..][..self.block_bytes()]; + self.dequant_block_f32(qblock, block); + } + Ok(tensor) + } + 
} + + fn dequant_f16(&self, input: &[u8]) -> TractResult { + unsafe { + let blocks = input.len() / self.block_bytes(); + let mut tensor = Tensor::uninitialized::(&[blocks * self.block_len()])?; + let slice = tensor.as_slice_mut::()?; + for b in 0..blocks { + let block = &mut slice[b * self.block_len()..][..self.block_len()]; + let qblock = &input[b * self.block_bytes()..][..self.block_bytes()]; + self.dequant_block_f16(qblock, block); + } + Ok(tensor) + } + } + + fn extract_at_offset_f16(&self, input: &[u8], offset: usize) -> f16 { + let len = self.block_len(); + let block_id = offset / len; + let mut block = vec![f16::zero(); self.block_len()]; + self.dequant_block_f16( + &input[block_id * self.block_bytes()..][..self.block_bytes()], + &mut block, + ); + block[offset % len] + } + + fn extract_at_offset_f32(&self, input: &[u8], offset: usize) -> f32 { + let len = self.block_len(); + let block_id = offset / len; + let mut block = vec![f32::zero(); self.block_len()]; + self.dequant_block_f32( + &input[block_id * self.block_bytes()..][..self.block_bytes()], + &mut block, + ); + block[offset % len] + } + + fn simulate_precision_loss( + &self, + mut tensor: Tensor, + block_axis: usize, + ) -> TractResult { + ensure!(block_axis == tensor.rank() - 1); + ensure!(tensor.shape()[block_axis] % self.block_len() == 0); + let mut scratch = vec![0u8; self.block_bytes()]; + if tensor.datum_type() == f32::datum_type() { + for block in tensor.as_slice_mut::()?.chunks_mut(self.block_len()) { + self.quant_block_f32(block, &mut scratch); + self.dequant_block_f32(&scratch, block); + } + Ok(tensor) + } else if tensor.datum_type() == f16::datum_type() { + for block in tensor.as_slice_mut::()?.chunks_mut(self.block_len()) { + self.quant_block_f16(block, &mut scratch); + self.dequant_block_f16(&scratch, block); + } + Ok(tensor) + } else { + todo!() + } + } + + fn pack( + &self, + input: &[u8], + k: usize, + r: usize, + zip: usize, + scales_at_end: bool, + ) -> TractResult; + + unsafe fn 
extract_packed_panel( + &self, + value: &EagerPackedInput, + target: &PackedFormat, + panel: usize, + scratch: *mut u8, + ) -> TractResult<()>; + + fn extract_at_mn_f16( + &self, + value: &EagerPackedInput, + mn: usize, + target: &mut [f16], + ) -> TractResult<()>; + + fn extract_at_mn_f32( + &self, + value: &EagerPackedInput, + mn: usize, + target: &mut [f32], + ) -> TractResult<()>; +} + +dyn_clone::clone_trait_object!(BlockQuant); +dyn_hash::hash_trait_object!(BlockQuant); +impl_downcast!(BlockQuant); + +#[allow(clippy::derived_hash_with_manual_eq)] +#[derive(Clone, Hash)] +pub struct PackedBlockQuantFormat { + pub bq: Box, + pub r: usize, + pub zip: usize, + pub scales_at_end: bool, +} + +impl PartialEq for PackedBlockQuantFormat { + fn eq(&self, other: &Self) -> bool { + self.bq.same_as(&*other.bq) + && self.r == other.r + && self.zip == other.zip + && self.scales_at_end == other.scales_at_end + } +} + +impl Eq for PackedBlockQuantFormat {} + +impl Display for PackedBlockQuantFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Packed{}[{}]", &*self.bq, self.r)?; + if self.zip != 0 { + write!(f, "Z{}", self.zip)?; + } + if self.scales_at_end { + write!(f, "Se")?; + } + Ok(()) + } +} + +impl Debug for PackedBlockQuantFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} + +impl PackedBlockQuantFormat { + pub fn new(bq: &dyn BlockQuant, r: usize, zip: usize, scales_at_end: bool) -> Self { + PackedBlockQuantFormat { bq: clone_box(bq), r, zip, scales_at_end } + } + + pub fn simulate_precision_loss( + &self, + tensor: Tensor, + block_axis: usize, + ) -> TractResult { + self.bq.simulate_precision_loss(tensor, block_axis) + } + + pub fn pack(&self, input: &[u8], k: usize) -> TractResult { + self.bq.pack(input, k, self.r, self.zip, self.scales_at_end) + } +} + +impl MMMInputFormat for PackedBlockQuantFormat { + fn prepare_tensor(&self, t: &Tensor, _k_axis: usize, _mn_axis: usize) 
-> TractResult { + let packed = t + .as_slice::()? + .iter() + .map(|o| { + let bqv = o.downcast_ref::().unwrap(); + let packed = self.pack(&bqv.value, bqv.fact.k())?; + Ok(Opaque(Arc::new(Box::new(packed) as Box))) + }) + .collect::>>()?; + tensor1(&packed).into_shape(t.shape()) + } + + fn prepare_one( + &self, + t: &Tensor, + k_axis: usize, + mn_axis: usize, + ) -> TractResult> { + // this code path is essentially there for test scenarios + let t = if t.datum_type().is_number() { + let k = t.shape()[k_axis]; + let m = t.shape()[mn_axis]; + assert!(k % self.bq.block_len() == 0); + let t: Cow = if k_axis == 1 && mn_axis == 0 { + Cow::Borrowed(t) + } else { + Cow::Owned(t.clone().move_axis(1, 0)?) + }; + let quant = if t.datum_type() == f32::datum_type() { + self.bq.quant_f32(t.as_slice()?)? + } else if t.datum_type() == f16::datum_type() { + self.bq.quant_f16(t.as_slice()?)? + } else { + todo!() + }; + Cow::Owned(tensor0(Opaque(Arc::new(BlockQuantValue { + value: Arc::new(quant), + fact: BlockQuantFact::new(self.bq.clone(), tvec!(m, k)), + })))) + } else { + Cow::Borrowed(t) + }; + ensure!(mn_axis == 0); + ensure!(k_axis == 1); + let bqv = t.to_scalar::()?.downcast_ref::().unwrap(); + let packed = self.pack(&bqv.value, bqv.fact.k())?; + Ok(Box::new(packed)) + } + + fn precursor(&self) -> WeightType { + WeightType::BlockQuant(self.bq.clone()) + } + + fn k_alignment(&self) -> usize { + self.bq.block_len() + } + + fn r(&self) -> usize { + self.r + } + + fn mem_size(&self, k: TDim, mn: TDim) -> TDim { + k * mn * self.bq.block_bytes() / self.bq.block_len() + } + + fn same_as(&self, other: &dyn MMMInputFormat) -> bool { + other.downcast_ref::().is_some_and(|other| self == other) + } + + fn extract_at_mn_f16( + &self, + data: &EagerPackedInput, + mn: usize, + slice: &mut [f16], + ) -> TractResult<()> { + self.bq.extract_at_mn_f16(data, mn, slice) + } + + fn extract_at_mn_f32( + &self, + data: &EagerPackedInput, + mn: usize, + slice: &mut [f32], + ) -> TractResult<()> { + 
self.bq.extract_at_mn_f32(data, mn, slice) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs new file mode 100644 index 000000000..44d12ca81 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs @@ -0,0 +1,509 @@ +use crate::mmm::PackedOpaqueFact; + +use super::*; +use num_traits::{AsPrimitive, Float, Zero}; +use std::alloc::Layout; + +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub struct BaseQ4_0; + +pub const Q4_0: BaseQ4_0 = BaseQ4_0::<32>; + +impl Debug for BaseQ4_0 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if QK == 32 { + write!(f, "Q4_0") + } else { + write!(f, "BaseQ4_0<{QK}>") + } + } +} + +impl BaseQ4_0 { + fn quant_block(&self, block: &[T], quant: &mut [u8]) + where + f32: AsPrimitive, + T: Debug + Float + AsPrimitive + AsPrimitive + 'static, + { + assert!(quant.len() == self.block_bytes()); + assert!(block.len() == self.block_len()); + let mut writer = NibbleWriter::for_slice(quant); + let mut amax = T::zero(); + let mut max = T::zero(); + for v in block { + if amax < v.abs() { + amax = v.abs(); + max = *v; + } + } + let scale: T = max / (-8f32).as_(); + let r_scale = if scale.is_zero() { T::zero() } else { scale.recip() }; + writer.write_f16(scale.as_()); + + for idx in 0..block.len() { + // Quant block in GGML nibble order + let ggml_idx = (block.len() / 2) * (idx % 2) + (idx / 2); + let i: i8 = (block[ggml_idx] * r_scale + (8.5f32).as_()).as_(); + writer.write_i4(i.min(15)); + } + } + + fn dequant_block(&self, quant: &[u8], block: &mut [T]) + where + f16: AsPrimitive, + i8: AsPrimitive, + { + assert!(quant.len() == self.block_bytes()); + assert!(block.len() == self.block_len()); + let mut nibbles = NibbleReader::for_slice(quant); + let d: T = nibbles.read_f16().as_(); + for idx in 0..block.len() { + let ggml_idx = (block.len() / 2) * (idx % 2) + (idx / 2); + block[ggml_idx] = (nibbles.read_i4() - 8).as_() * d; 
+ } + } + + unsafe fn extract_panel_t( + &self, + value: &EagerPackedInput, + target: &PackedFormat, + panel: usize, + scratch: *mut u8, + ) -> TractResult<()> + where + f16: AsPrimitive, + i8: AsPrimitive, + { + let pbqf: &PackedBlockQuantFormat = + value.fact.format.downcast_ref().with_context(|| { + format!("Expecing PackedBlockQuantFormat, found {:?}", value.fact.format) + })?; + ensure!(pbqf.r == target.r); + ensure!(value.fact.k % self.block_len() == 0); + ensure!(pbqf.bq.same_as(self)); + let scratch = + unsafe { std::slice::from_raw_parts_mut(scratch as *mut T, value.fact.k * target.r) }; + let blocks_for_k = value.fact.k / self.block_len(); + let row_bytes = blocks_for_k * self.block_bytes(); + let input = &value.packed[panel * target.r * row_bytes..]; + let mut scales = vec![T::zero(); target.r]; + let mut scratch = scratch.iter_mut(); + let zipped_order = zipped_order(pbqf.r, pbqf.zip); + let mut weights = vec![0i8; pbqf.r]; + let panel_block_bytes = target.r * self.block_bytes(); + let (scale_offset, weights_offset) = if pbqf.scales_at_end { + (panel_block_bytes - target.r * f16::datum_type().size_of(), 0) + } else { + (0, target.r * f16::datum_type().size_of()) + }; + for block in 0..blocks_for_k { + let block = &input[block * panel_block_bytes..][..panel_block_bytes]; + let mut s_reader = NibbleReader::for_slice(&block[scale_offset..]); + let mut w_reader = NibbleReader::for_slice(&block[weights_offset..]); + for s in &mut scales { + *s = s_reader.read_f16().as_(); + } + for _ in 0..self.block_len() { + for &o in &zipped_order { + weights[o] = w_reader.read_i4(); + } + for (w, s) in weights.iter().zip(scales.iter()) { + *scratch.next().unwrap() = *s * (*w - 8).as_(); + } + } + } + Ok(()) + } + + fn extract_at_mn_t( + &self, + value: &EagerPackedInput, + mn: usize, + target: &mut [T], + ) -> TractResult<()> + where + f16: AsPrimitive, + i8: AsPrimitive, + { + let pbqf: &PackedBlockQuantFormat = + value.fact.format.downcast_ref().with_context(|| { + 
format!("Expecing PackedBlockQuantFormat, found {:?}", value.fact.format) + })?; + ensure!(value.fact.k % self.block_len() == 0); + ensure!(pbqf.bq.same_as(self)); + ensure!(value.fact.mn.to_usize().ok().map(|it| mn < it).unwrap_or(true)); + ensure!(value.fact.k == target.len()); + let blocks_for_k = value.fact.k / self.block_len(); + let row_bytes = blocks_for_k * self.block_bytes(); + let panel = mn / pbqf.r; + let value = &value.packed[panel * pbqf.r * row_bytes..]; + let mut target = target.iter_mut(); + let zipped_order = + zipped_order(pbqf.r, pbqf.zip).iter().position(|x| *x == mn % pbqf.r).unwrap(); + + let panel_block_bytes = pbqf.r * self.block_bytes(); + let (scale_offset, weights_offset) = if pbqf.scales_at_end { + (panel_block_bytes - pbqf.r * f16::datum_type().size_of(), 0) + } else { + (0, pbqf.r * f16::datum_type().size_of()) + }; + unsafe { + for block in 0..blocks_for_k { + let block = value.as_ptr().add(block * panel_block_bytes); + let scale = *((block.add(scale_offset) as *const f16).add(mn % pbqf.r)); + let scale: T = scale.as_(); + for i in 0..self.block_len() { + let byte = *block.add(weights_offset + i * pbqf.r / 2 + zipped_order / 2); + let nib = if zipped_order % 2 == 0 { byte & 0x0F } else { byte >> 4 }; + *target.next().unwrap() = scale * ((nib as i8) - 8).as_(); + } + } + } + Ok(()) + } +} + +fn zipped_order(r: usize, zip: usize) -> Vec { + if zip == 0 { + (0..r).collect_vec() + } else { + (0..r) + .map(|i| { + let vec_pair_ix = i / (2 * zip); + let lane = (i % (2 * zip)) / 2; + let side = i % 2; + vec_pair_ix * 2 * zip + side * zip + lane + }) + .collect_vec() + } +} + +impl BlockQuant for BaseQ4_0 { + fn same_as(&self, other: &dyn BlockQuant) -> bool { + other.downcast_ref::().map(|other| other == self).unwrap_or(false) + } + + fn block_len(&self) -> usize { + QK + } + + fn block_bytes(&self) -> usize { + 2 + self.block_len() / 2 + } + + fn quant_block_f32(&self, block: &[f32], quant: &mut [u8]) { + self.quant_block(block, quant) + } 
+ + fn quant_block_f16(&self, block: &[f16], quant: &mut [u8]) { + self.quant_block(block, quant) + } + + fn dequant_block_f32(&self, quant: &[u8], block: &mut [f32]) { + self.dequant_block(quant, block) + } + + fn dequant_block_f16(&self, quant: &[u8], block: &mut [f16]) { + self.dequant_block(quant, block) + } + + // s0_0 n0_0 n0_1 n0_2 n0_3 ... n0_30n0_31 s0_32 n0_32n0_33 ... + // s1_0 n1_0 n1_1 n1_2 n1_3 ... n1_30n1_31 s1_32 n1_32n1_33 ... + // + // becomes (with r=4) + // + // s0_0 S1_0 S2_0 s3_0 n0_0 n1_0 n2_0 n3_0 n0_1 n1_1 n2_1 n3_1 ... n0_33 n1_33 n2_33 n3_33 + // s0_32 S1_32 S2_32 s3_32 n0_0 n1_0 n2_0 n3_0 n0_1 n1_1 n2_1 n3_1 ... n0_33 n1_33 n2_33 n3_33 + // ... + fn pack( + &self, + input: &[u8], + k: usize, + r: usize, + zip: usize, + scales_at_end: bool, + ) -> TractResult { + ensure!(input.len() % self.block_bytes() == 0); + ensure!(k % self.block_len() == 0); + // ensure!(input.len() == k * r / self.block_len() * self.block_bytes()); + ensure!(zip < r); + let m = if input.len() == 0 { + 0 + } else { + input.len() / self.block_bytes() * self.block_len() / k + }; + let panels = m.divceil(r); + let blocks_for_k = k / self.block_len(); + let row_bytes = blocks_for_k * self.block_bytes(); + let panel_bytes = row_bytes * r; + let mut blob = + unsafe { Blob::for_layout(Layout::from_size_align(panel_bytes * panels, 128)?) 
}; + let mut writer = NibbleWriter::for_slice(&mut blob); + let order = zipped_order(r, zip); + let mut scales = vec![f16::zero(); r]; + for p in 0..panels { + let input = &input[(r * p) * row_bytes..]; + let mut readers = (0..r) + .map(|r| { + // manage partial panel + let offset = if r * row_bytes < input.len() { r * row_bytes } else { 0 }; + NibbleReader::for_slice(&input[offset..]) + }) + .collect_vec(); + let mut temp_nibbles = vec![vec![0i8; self.block_len()]; r]; + for _ in 0..blocks_for_k { + for (row, reader) in readers.iter_mut().enumerate() { + scales[row] = reader.read_f16(); + temp_nibbles[row] = + (0..self.block_len()).map(|_| reader.read_i4()).collect_vec(); + } + if !scales_at_end { + scales.iter().for_each(|s| writer.write_f16(*s)) + } + for pos in 0..self.block_len() { + for &row in &order { + let ggml_idx = pos / (self.block_len() / 2) + (2 * pos) % self.block_len(); + let nib = temp_nibbles[row][ggml_idx]; + writer.write_i4(nib); + } + } + if scales_at_end { + scales.iter().for_each(|s| writer.write_f16(*s)) + } + } + } + Ok(EagerPackedInput { + fact: PackedOpaqueFact { + format: Box::new(PackedBlockQuantFormat { + bq: Box::new(*self), + r, + zip, + scales_at_end, + }), + mn: m.to_dim(), + k, + }, + packed: blob.into(), + panel_bytes, + mn: m, + }) + } + + unsafe fn extract_packed_panel( + &self, + value: &EagerPackedInput, + target: &PackedFormat, + panel: usize, + scratch: *mut u8, + ) -> TractResult<()> { + unsafe { + dispatch_floatlike!(Self::extract_panel_t(target.dt)( + self, value, target, panel, scratch + )) + } + } + + fn extract_at_mn_f16( + &self, + value: &EagerPackedInput, + mn: usize, + target: &mut [f16], + ) -> TractResult<()> { + self.extract_at_mn_t(value, mn, target) + } + + fn extract_at_mn_f32( + &self, + value: &EagerPackedInput, + mn: usize, + target: &mut [f32], + ) -> TractResult<()> { + self.extract_at_mn_t(value, mn, target) + } +} + +impl Display for BaseQ4_0 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + write!(f, "Q4_0") + } +} + +#[cfg(test)] +mod tests { + use num_traits::Zero; + use tract_data::internal::tract_ndarray::Array2; + + use crate::pack::PackedFormat; + + use super::*; + + fn test_loop_f32(b: impl BlockQuant, data: &[f32]) { + let mut input = data.to_vec(); + while input.len() % b.block_len() != 0 { + input.push(0f32); + } + let quant = b.quant_f32(&input).unwrap(); + let result = b.dequant_f32(&quant).unwrap(); + let view = &result.as_slice::().unwrap()[..data.len()]; + assert_eq!(data, view); + } + + fn test_loop_f16(b: impl BlockQuant, data: &[f32]) { + let mut input = data.iter().map(|f| f16::from_f32(*f)).collect_vec(); + while input.len() % b.block_len() != 0 { + input.push(f16::zero()); + } + let quant = b.quant_f16(&input).unwrap(); + let result = b.dequant_f16(&quant).unwrap(); + let view = &result.as_slice::().unwrap(); + assert_eq!(&input, view); + } + + #[test] + fn loop_q4f32_pos() { + test_loop_f32(Q4_0, &[1.0, 2.0, 3.0, 4.0]); + } + + #[test] + fn loop_q4f16_pos() { + test_loop_f16(Q4_0, &[1.0, 2.0, 3.0, 4.0]); + } + + #[test] + fn loop_q4f32_neg() { + test_loop_f32(Q4_0, &[-1.0, -2.0, -3.0, -4.0]); + } + + #[test] + fn loop_q4f16_beg() { + test_loop_f16(Q4_0, &[-1.0, -2.0, -3.0, -4.0]); + } + + #[test] + fn loop_q4_big_pos() { + test_loop_f32(Q4_0, &[1234.0]); + test_loop_f16(Q4_0, &[-1.0, -2.0, -3.0, -4.0]); + } + + #[test] + fn loop_q4_big_neg() { + test_loop_f32(Q4_0, &[-1234.0]); + test_loop_f16(Q4_0, &[-1234.0]); + } + + fn test_extract_f32(b: impl BlockQuant, data: &[f32]) { + let mut input = data.to_vec(); + while input.len() % b.block_len() != 0 { + input.push(0f32); + } + let quant = b.quant_f32(&input).unwrap(); + for (ix, v) in data.iter().enumerate() { + assert_eq!(b.extract_at_offset_f32(&quant, ix).round(), *v); + } + } + + #[test] + fn extract_q40f32_pos() { + let data = (1..).map(|i| ((i % 14) - 6) as f32).take(5 * Q4_0.block_len()).collect_vec(); + test_extract_f32(Q4_0, &data); + } + + fn 
test_pack_then_extract_panel( + q: impl BlockQuant, + k: usize, + m: usize, + r: usize, + zip: usize, + scales_at_end: bool, + ) -> TractResult<()> { + let weights_orig = + Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.) + .into_tensor(); + let weights_f32 = + q.dequant_f32(&q.quant_f32(weights_orig.as_slice::()?)?)?.into_shape(&[m, k])?; + let packer = PackedFormat::new(f32::datum_type(), r, 128); + let packed_f32 = packer.pack_tensor(&weights_f32, 1, 0)?; + + let q4 = q.quant_f32(weights_f32.as_slice::()?)?; + let packed_q4 = q.pack(&q4, k, r, zip, scales_at_end)?; + + for panel in 0..packed_f32.panels_count() { + unsafe { + let panel_f32 = packed_f32.panel_bytes(panel, None)?; + let panel_f32 = std::slice::from_raw_parts(panel_f32 as *const f32, k * r); + let mut panel_q4 = Tensor::zero::(&[k * r])?; + q.extract_packed_panel( + &packed_q4, + &packer, + panel, + panel_q4.as_bytes_mut().as_mut_ptr(), + )?; + assert_eq!(panel_q4.as_slice::()?, panel_f32); + } + } + Ok(()) + } + + #[test] + fn pack_then_extract_panel() -> TractResult<()> { + test_pack_then_extract_panel(BaseQ4_0::<2>, 4, 4, 2, 0, false) + } + + #[test] + fn pack_then_extract_panel_with_zip() -> TractResult<()> { + test_pack_then_extract_panel(BaseQ4_0::<2>, 2, 8, 8, 4, false) + } + + #[test] + fn pack_then_extract_panel_with_scales_at_end() -> TractResult<()> { + test_pack_then_extract_panel(BaseQ4_0::<2>, 2, 4, 4, 0, true) + } + + fn test_pack_then_extract_row( + q: impl BlockQuant, + k: usize, + m: usize, + r: usize, + zip: usize, + scales_at_end: bool, + ) -> TractResult<()> { + let weights_orig = + Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.) 
+ .into_tensor(); + let weights_f32 = + q.dequant_f32(&q.quant_f32(weights_orig.as_slice::()?)?)?.into_shape(&[m, k])?; + let packer = PackedFormat::new(f32::datum_type(), r, 128); + let packed_f32 = packer.pack_tensor(&weights_f32, 1, 0)?; + + let q4 = q.quant_f32(weights_f32.as_slice::()?)?; + let packed_q4 = q.pack(&q4, k, r, zip, scales_at_end)?; + + for row in 0..packed_f32.mn() { + unsafe { + let panel_f32 = packed_f32.panel_bytes(row / r, None)?; + let panel_f32 = std::slice::from_raw_parts(panel_f32 as *const f32, k * r); + let row_f32 = (0..k).map(|ix| panel_f32[row % r + r * ix]).collect_vec(); + + let mut q4 = vec![0f32; k]; + q.extract_at_mn_f32(&packed_q4, row, &mut q4)?; + assert_eq!(q4, row_f32); + } + } + Ok(()) + } + + #[test] + fn pack_then_extract_row() -> TractResult<()> { + test_pack_then_extract_row(BaseQ4_0::<2>, 4, 4, 2, 0, false) + } + + #[test] + fn pack_then_extract_row_with_zip() -> TractResult<()> { + test_pack_then_extract_row(BaseQ4_0::<2>, 2, 8, 8, 4, false) + } + + #[test] + fn pack_then_extract_row_with_scales_at_end() -> TractResult<()> { + test_pack_then_extract_row(BaseQ4_0::<2>, 2, 4, 4, 0, true) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs new file mode 100644 index 000000000..3b564fa1a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs @@ -0,0 +1,116 @@ +use std::ops::Range; +use std::sync::Arc; + +use super::{BlockQuant, PackedBlockQuantFormat}; +use tract_data::internal::*; +use tract_data::TVec; + +#[allow(clippy::derived_hash_with_manual_eq)] +#[derive(Clone, Hash)] +pub struct BlockQuantFact { + pub format: Box, + shape: TVec, +} +impl BlockQuantFact { + pub fn new(format: Box, shape: TVec) -> Self { + Self { format, shape } + } + + pub fn m(&self) -> usize { + self.shape[0] + } + + pub fn k(&self) -> usize { + self.shape.iter().skip(1).product() + } + + pub fn shape(&self) -> &[usize] { + &self.shape + 
} +} + +impl std::fmt::Debug for BlockQuantFact { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}({:?})", self.format, self.shape) + } +} + +impl OpaqueFact for BlockQuantFact { + fn mem_size(&self) -> TDim { + (self.shape.iter().product::() / self.format.block_len() * self.format.block_bytes()) + .to_dim() + } + + fn same_as(&self, other: &dyn OpaqueFact) -> bool { + other.downcast_ref::().is_some_and(|o| o == self) + } +} + +impl PartialEq for BlockQuantFact { + fn eq(&self, other: &Self) -> bool { + self.format.same_as(&*other.format) && self.shape == other.shape + } +} + +#[derive(Clone, Hash)] +pub struct BlockQuantValue { + pub fact: BlockQuantFact, + pub value: Arc, +} + +impl BlockQuantValue { + pub fn split_rows(&self, range: Range) -> TractResult { + let row_bytes = + self.fact.k() / self.fact.format.block_len() * self.fact.format.block_bytes(); + let mut value = + unsafe { Blob::new_for_size_and_align(range.len() * row_bytes, vector_size()) }; + value.copy_from_slice(&self.value[range.start * row_bytes..][..range.len() * row_bytes]); + let mut shape = self.fact.shape.clone(); + shape[0] = range.len(); + Ok(BlockQuantValue { + fact: BlockQuantFact { format: self.fact.format.clone(), shape }, + value: Arc::new(value), + }) + } +} + +impl OpaquePayload for BlockQuantValue { + fn same_as(&self, other: &dyn OpaquePayload) -> bool { + other.downcast_ref::().is_some_and(|o| o.fact == self.fact && o.value == self.value) + } +} + +impl std::fmt::Debug for BlockQuantValue { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?} {:?}", self.fact, self.value) + } +} + +impl std::fmt::Display for BlockQuantValue { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + +#[derive(Clone, Hash, PartialEq)] +pub struct PackedBlockQuantFact { + pub format: PackedBlockQuantFormat, + pub shape: TVec, +} + +impl std::fmt::Debug for PackedBlockQuantFact { + 
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}({:?})", self.format, self.shape) + } +} + +impl OpaqueFact for PackedBlockQuantFact { + fn mem_size(&self) -> TDim { + (self.shape.iter().product::() / self.format.bq.block_len() + * self.format.bq.block_bytes()) + .to_dim() + } + fn same_as(&self, other: &dyn OpaqueFact) -> bool { + other.downcast_ref::().is_some_and(|o| o == self) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs new file mode 100644 index 000000000..0405043e8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs @@ -0,0 +1,96 @@ +use std::fmt::Debug; +use std::marker::PhantomData; + +use crate::element_wise::{ElementWise, ElementWiseKer}; +use crate::element_wise_helper::map_slice_with_alignment; +use crate::{LADatum, LinalgFn}; +use tract_data::internal::*; + +/// Generic implementation struct that unify all by scalar kernels. +/// A by scalar operation is an ElementWise operation with a scalar paramerer. +#[derive(Debug, Clone, new)] +pub struct ByScalarImpl +where + T: LADatum, + K: ByScalarKer + Clone, +{ + phantom: PhantomData<(K, T)>, +} + +impl ElementWise for ByScalarImpl +where + T: LADatum, + K: ByScalarKer + Clone, +{ + fn name(&self) -> &'static str { + K::name() + } + fn run_with_params(&self, vec: &mut [T], params: T) -> TractResult<()> { + map_slice_with_alignment(vec, |data| K::run(data, params), K::nr(), K::alignment_bytes()) + } +} + +pub trait ByScalarKer: ElementWiseKer +where + T: LADatum, +{ + fn bin() -> Box { + Box::new(|a: &mut TensorView, b: &TensorView| { + let a_slice = a.as_slice_mut()?; + let b = b.as_slice()?[0]; + (Self::ew()).run_with_params(a_slice, b) + }) + } +} + +macro_rules! by_scalar_impl_wrap { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $run: item) => { + paste! 
{ + ew_impl_wrap!($ti, $func, $nr, $alignment_items, $ti, $run); + + impl crate::frame::by_scalar::ByScalarKer<$ti> for $func {} + } + }; +} + +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::frame::element_wise::ElementWiseKer; + use crate::LADatum; + use num_traits::{AsPrimitive, Float}; + use proptest::test_runner::TestCaseResult; + + #[macro_export] + macro_rules! by_scalar_frame_tests { + ($cond:expr, $t: ty, $ker:ty, $func:expr) => { + pastey::paste! { + proptest::proptest! { + #[test] + fn [](xs in proptest::collection::vec(-25f32..25.0, 0..100), scalar in -25f32..25f32) { + if $cond { + $crate::frame::by_scalar::test::test_by_scalar::<$ker, $t>(&*xs, scalar, $func).unwrap() + } + } + } + } + }; + } + + pub fn test_by_scalar, T: LADatum + Float>( + values: &[f32], + scalar: f32, + func: impl Fn(T, T) -> T, + ) -> TestCaseResult + where + f32: AsPrimitive, + { + crate::setup_test_logger(); + let values: Vec = values.iter().copied().map(|x| x.as_()).collect(); + crate::frame::element_wise::test::test_element_wise_params::( + &values, + |a| (func)(a, scalar.as_()), + scalar.as_(), + ) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/element_wise.rs b/vendor/tract-linalg-0.22.1/src/frame/element_wise.rs new file mode 100644 index 000000000..824ec36c1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/element_wise.rs @@ -0,0 +1,165 @@ +use std::fmt::Debug; +use std::marker::PhantomData; + +use tract_data::TractResult; + +use crate::LADatum; + +use super::element_wise_helper::map_slice_with_alignment; + +macro_rules! ew_impl_wrap { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $run: item) => { + paste! 
{ + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + pub struct $func; + + impl crate::frame::element_wise::ElementWiseKer<$ti, $params> for $func { + #[inline(always)] + fn name() -> &'static str { + stringify!($func) + } + #[inline(always)] + fn nr() -> usize { + $nr + } + #[inline(always)] + fn alignment_items() -> usize { + $alignment_items + } + $run + } + } + }; +} + +macro_rules! ew_impl { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr) => { + paste! { + mod [] { + #[allow(unused_imports)] + use tract_data::prelude::f16; + extern_kernel!(fn $func(ptr: *mut $ti, count: usize) -> ()); + } + ew_impl_wrap!($ti, $func, $nr, $alignment_items, (), + #[inline(never)] + fn run(buf: &mut [$ti], _params: ()) { + unsafe { []::$func(buf.as_mut_ptr(), buf.len()) } + } + ); + } + }; + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty) => { + paste! { + mod [] { + #[allow(unused_imports)] + use tract_data::prelude::f16; + extern_kernel!(fn $func(ptr: *mut $ti, count: usize, params: $params) -> ()); + } + ew_impl_wrap!($ti, $func, $nr, $alignment_items, $params, + #[inline(never)] + fn run(buf: &mut [$ti], params: $params) { + unsafe { []::$func(buf.as_mut_ptr(), buf.len(), params) } + } + ); + } + }; +} + +pub trait ElementWise: Send + Sync + Debug + dyn_clone::DynClone +where + Params: Copy + Send + Sync + Debug + 'static + Default, + T: Copy + Debug + PartialEq + Send + Sync, +{ + fn name(&self) -> &'static str; + fn run(&self, vec: &mut [T]) -> TractResult<()> { + self.run_with_params(vec, Params::default()) + } + fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult<()>; +} + +dyn_clone::clone_trait_object!( ElementWise where T: Copy, Params: Copy); + +#[derive(Debug, Clone, new)] +pub struct ElementWiseImpl +where + T: LADatum, + Params: Copy + Send + Sync + Debug + 'static + Default, + K: ElementWiseKer + Clone, +{ + phantom: PhantomData<(K, T, Params)>, +} + +impl ElementWise for 
ElementWiseImpl +where + T: LADatum, + Params: Copy + Send + Sync + Debug + 'static + Default, + K: ElementWiseKer + Clone, +{ + fn name(&self) -> &'static str { + K::name() + } + fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult<()> { + map_slice_with_alignment(vec, |data| K::run(data, params), K::nr(), K::alignment_bytes()) + } +} + +pub trait ElementWiseKer: + Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static +where + Params: Copy + Send + Sync + Debug + 'static + Default, + T: LADatum, +{ + fn name() -> &'static str; + fn alignment_bytes() -> usize { + Self::alignment_items() * T::datum_type().size_of() + } + fn alignment_items() -> usize; + fn nr() -> usize; + fn run(vec: &mut [T], params: Params); + fn ew() -> Box> { + Box::new(ElementWiseImpl::::new()) + } +} + +#[cfg(test)] +pub mod test { + use crate::{frame::element_wise::*, LADatum}; + use proptest::test_runner::{TestCaseError, TestCaseResult}; + use tract_data::internal::*; + + pub fn test_element_wise, T: LADatum, F: Fn(T) -> T>( + values: &[T], + reference: F, + ) -> TestCaseResult { + test_element_wise_params::(values, reference, ()) + } + + pub fn test_element_wise_params< + K: ElementWiseKer, + T: LADatum, + F: Fn(T) -> T, + Params, + >( + values: &[T], + reference: F, + params: Params, + ) -> TestCaseResult + where + Params: Copy + Send + Sync + Debug + 'static + Default, + { + crate::setup_test_logger(); + let op = ElementWiseImpl::::new(); + let mut values = values.to_vec(); + while values.len() < K::nr() { + values.push(T::zero()); + } + let expected = values.iter().copied().map(reference).collect::>(); + let mut found = values; + op.run_with_params(&mut found, params).unwrap(); + tensor1(&found) + .close_enough(&tensor1(&expected), true) + .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?; + Ok(()) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs b/vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs new file 
mode 100644 index 000000000..f4b308464 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs @@ -0,0 +1,169 @@ +use crate::LADatum; +use std::alloc::*; +use tract_data::TractResult; + +pub(crate) fn map_slice_with_alignment( + vec: &mut [T], + f: impl Fn(&mut [T]), + nr: usize, + alignment_bytes: usize, +) -> TractResult<()> +where + T: LADatum, +{ + if vec.is_empty() { + return Ok(()); + } + unsafe { + TMP.with(|buffer| { + let mut buffer = buffer.borrow_mut(); + buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes); + let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr); + let mut compute_via_temp_buffer = |slice: &mut [T]| { + tmp[..slice.len()].copy_from_slice(slice); + f(tmp); + slice.copy_from_slice(&tmp[..slice.len()]) + }; + let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len()); + if prefix_len > 0 { + compute_via_temp_buffer(&mut vec[..prefix_len]); + } + let aligned_len = (vec.len() - prefix_len) / nr * nr; + if aligned_len > 0 { + f(&mut vec[prefix_len..][..aligned_len]); + } + if prefix_len + aligned_len < vec.len() { + compute_via_temp_buffer(&mut vec[prefix_len + aligned_len..]); + } + }) + } + Ok(()) +} + +pub(crate) fn reduce_slice_with_alignment( + vec: &[T], + f: impl Fn(&[T]) -> T, + nr: usize, + alignment_bytes: usize, + neutral: T, + reduce: impl Fn(T, T) -> T, +) -> TractResult +where + T: LADatum, +{ + if vec.is_empty() { + return Ok(neutral); + } + let mut red = neutral; + unsafe { + TMP.with(|buffer| { + let mut buffer = buffer.borrow_mut(); + buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes); + let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr); + let mut compute_via_temp_buffer = |slice: &[T], red: &mut T| { + tmp[..slice.len()].copy_from_slice(slice); + tmp[slice.len()..].fill(neutral); + *red = reduce(*red, f(tmp)); + }; + let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len()); + if prefix_len > 0 { + 
compute_via_temp_buffer(&vec[..prefix_len], &mut red); + } + let aligned_len = (vec.len() - prefix_len) / nr * nr; + if aligned_len > 0 { + let t = f(&vec[prefix_len..][..aligned_len]); + red = reduce(red, t); + } + if prefix_len + aligned_len < vec.len() { + compute_via_temp_buffer(&vec[prefix_len + aligned_len..], &mut red); + } + }) + } + Ok(red) +} + +pub(crate) fn map_reduce_slice_with_alignment( + vec: &mut [T], + f: impl Fn(&mut [T]) -> T, + nr: usize, + alignment_bytes: usize, + map_neutral: T, + neutral: T, + reduce: impl Fn(T, T) -> T, +) -> TractResult +where + T: LADatum, +{ + if vec.is_empty() { + return Ok(neutral); + } + let mut red = neutral; + unsafe { + TMP.with(|buffer| { + let mut buffer = buffer.borrow_mut(); + buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes); + let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr); + let mut compute_via_temp_buffer = |slice: &mut [T], red: &mut T| { + tmp[..slice.len()].copy_from_slice(slice); + tmp[slice.len()..].fill(map_neutral); + *red = reduce(*red, f(tmp)); + slice.copy_from_slice(&tmp[..slice.len()]); + }; + let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len()); + if prefix_len > 0 { + compute_via_temp_buffer(&mut vec[..prefix_len], &mut red); + } + let aligned_len = (vec.len() - prefix_len) / nr * nr; + if aligned_len > 0 { + let t = f(&mut vec[prefix_len..][..aligned_len]); + red = reduce(red, t); + } + if prefix_len + aligned_len < vec.len() { + compute_via_temp_buffer(&mut vec[prefix_len + aligned_len..], &mut red); + } + }) + } + Ok(red) +} + +std::thread_local! 
{ + static TMP: std::cell::RefCell = std::cell::RefCell::new(TempBuffer::default()); +} + +pub struct TempBuffer { + pub layout: Layout, + pub buffer: *mut u8, +} + +impl Default for TempBuffer { + fn default() -> Self { + TempBuffer { layout: Layout::new::<()>(), buffer: std::ptr::null_mut() } + } +} + +impl TempBuffer { + pub fn ensure(&mut self, size: usize, alignment: usize) { + unsafe { + if size > self.layout.size() || alignment > self.layout.align() { + let size = size.max(self.layout.size()); + let alignment = alignment.max(self.layout.align()); + if !self.buffer.is_null() { + std::alloc::dealloc(self.buffer, self.layout); + } + self.layout = Layout::from_size_align_unchecked(size, alignment); + self.buffer = std::alloc::alloc(self.layout); + assert!(!self.buffer.is_null()); + } + } + } +} + +impl Drop for TempBuffer { + fn drop(&mut self) { + unsafe { + if !self.buffer.is_null() { + std::alloc::dealloc(self.buffer, self.layout); + } + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs new file mode 100644 index 000000000..8abf5b01f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs @@ -0,0 +1,65 @@ +#[allow(unused_macros)] +macro_rules! leaky_relu_impl { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => { + ew_impl!($ti, $func, $nr, $alignment_items, $ti); + #[cfg(test)] + paste! { + mod [] { + use super::*; + leaky_relu_frame_tests!($cond, $ti, $func); + } + } + }; +} + +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::{frame::element_wise::*, LADatum}; + use num_traits::{AsPrimitive, Float}; + use proptest::test_runner::TestCaseResult; + + #[macro_export] + macro_rules! leaky_relu_frame_tests { + ($cond:expr, $t: ty, $ker:ty) => { + proptest::proptest! 
{ + #[test] + fn prop(xs in proptest::collection::vec(-25f32..25.0, 0..100), alpha in 0f32..1f32) { + if $cond { + $crate::frame::leaky_relu::test::test_leaky_relu::<$ker, $t>(&*xs, alpha).unwrap() + } + } + } + #[test] + fn trivial() { + if $cond { + $crate::frame::leaky_relu::test::test_leaky_relu::<$ker, $t>(&[-10f32], 0.0496).unwrap(); + } + } + }; + } + + pub fn test_leaky_relu, T: LADatum + Float>( + values: &[f32], + alpha: f32, + ) -> TestCaseResult + where + f32: AsPrimitive, + { + let data = tract_data::prelude::tensor1(values); + let data = data.cast_to::().unwrap(); + let data = data.as_slice::().unwrap(); + let alpha: T = tract_data::prelude::tensor0(alpha).cast_to_scalar::().unwrap(); + crate::frame::element_wise::test::test_element_wise_params::( + data, + |x: T| { + if x > T::zero() { + x + } else { + alpha * x + } + }, + alpha, + ) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/lut.rs b/vendor/tract-linalg-0.22.1/src/frame/lut.rs new file mode 100644 index 000000000..faaa43b49 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/lut.rs @@ -0,0 +1,141 @@ +use std::fmt; +use std::hash::Hash; +use std::marker::PhantomData; +use tract_data::internal::*; + +pub trait Lut: fmt::Debug + dyn_clone::DynClone + Send + Sync { + fn table(&self) -> &[u8]; + fn run(&self, buf: &mut [u8]); +} + +dyn_clone::clone_trait_object!(Lut); + +#[derive(Debug, Clone, Hash)] +pub struct LutImpl { + table: Tensor, + _boo: PhantomData, +} + +impl LutImpl { + pub fn new(table: &[u8]) -> LutImpl { + unsafe { + LutImpl { + table: Tensor::from_raw_aligned::( + &[table.len()], + table, + K::table_alignment_bytes(), + ) + .unwrap(), + _boo: PhantomData, + } + } + } +} + +impl Lut for LutImpl { + fn table(&self) -> &[u8] { + self.table.as_slice().unwrap() + } + + fn run(&self, buf: &mut [u8]) { + unsafe { + let table: *const u8 = self.table.as_ptr_unchecked(); + let align = K::input_alignment_bytes(); + let aligned_start = (buf.as_ptr() as 
usize).next_multiple_of(align); + let prefix = (aligned_start - buf.as_ptr() as usize).min(buf.len()); + for i in 0..(prefix as isize) { + let ptr = buf.as_mut_ptr().offset(i); + *ptr = *table.offset(*ptr as isize); + } + let remaining = buf.len() - prefix; + if remaining == 0 { + return; + } + let n = K::n(); + let aligned_len = remaining / n * n; + if aligned_len > 0 { + K::run(buf.as_mut_ptr().add(prefix), aligned_len, table); + } + let remaining = buf.len() - aligned_len - prefix; + for i in 0..remaining { + let ptr = buf.as_mut_ptr().add(i + prefix + aligned_len); + *ptr = *table.offset(*ptr as isize); + } + } + } +} + +pub trait LutKer: Clone + fmt::Debug + Send + Sync + Hash { + fn name() -> &'static str; + fn n() -> usize; + fn input_alignment_bytes() -> usize; + fn table_alignment_bytes() -> usize; + unsafe fn run(buf: *mut u8, len: usize, table: *const u8); +} + +#[cfg(test)] +#[macro_use] +pub mod test { + use super::*; + use proptest::prelude::*; + + #[derive(Debug)] + pub struct LutProblem { + pub table: Vec, + pub data: Vec, + } + + impl Arbitrary for LutProblem { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_p: ()) -> Self::Strategy { + proptest::collection::vec(any::(), 1..256) + .prop_flat_map(|table| { + let data = proptest::collection::vec(0..table.len() as u8, 0..100); + (Just(table), data) + }) + .prop_map(|(table, data)| LutProblem { table, data }) + .boxed() + } + } + + impl LutProblem { + pub fn reference(&self) -> Vec { + self.data.iter().map(|x| self.table[*x as usize]).collect() + } + + pub fn test(&self) -> Vec { + let lut = LutImpl::::new(&self.table); + let mut data = self.data.clone(); + lut.run(&mut data); + data + } + } + + #[macro_export] + macro_rules! lut_frame_tests { + ($cond:expr, $ker:ty) => { + mod lut { + use proptest::prelude::*; + #[allow(unused_imports)] + use $crate::frame::lut::test::*; + + proptest::proptest! 
{ + #[test] + fn lut_prop(pb in any::()) { + if $cond { + prop_assert_eq!(pb.test::<$ker>(), pb.reference()) + } + } + } + + #[test] + fn test_empty() { + let pb = LutProblem { table: vec![0], data: vec![] }; + assert_eq!(pb.test::<$ker>(), pb.reference()) + } + } + }; + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs new file mode 100644 index 000000000..5f8cb46ba --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs @@ -0,0 +1,86 @@ +use tract_data::internal::*; +use tract_data::itertools::{izip, Itertools}; + +use super::MatMatMul; + +fn order_f(&a: &F, &b: &F) -> std::cmp::Ordering { + if a < b { + std::cmp::Ordering::Less + } else { + std::cmp::Ordering::Greater + } +} + +#[derive(Debug)] +pub struct CostModel<'a> { + pub big_product_mkn_threshold: f32, + pub big_product_kernel_choice: &'a str, + pub kernels: &'a [&'a str], + pub mrs: &'a [u32], + pub nrs: &'a [u32], + pub feat_norm_mean: &'a [f32], + pub feat_norm_stddev: &'a [f32], + pub w1: &'a [f32], + pub b1: &'a [f32], + pub w2: &'a [f32], + pub b2: &'a [f32], +} + +impl CostModel<'_> { + pub fn features(&self, m: usize, k: usize, n: usize) -> Vec { + let mut feat = vec![ + (m as f32).ln(), + (k as f32).ln(), + (n as f32).ln(), + (n as f32 * m as f32 * k as f32).ln(), + ]; + for &mr in self.mrs { + let mr = mr as usize; + feat.push((m % mr) as f32); + feat.push((m % mr != 0) as usize as f32); + } + for &nr in self.nrs { + let nr = nr as usize; + feat.push((n % nr) as f32); + feat.push((n % nr != 0) as usize as f32); + } + feat + } + + fn normalize(&self, feat: &mut [f32]) { + izip!(feat, self.feat_norm_mean, self.feat_norm_stddev) + .for_each(|(x, m, s)| *x = (*x - m) / s) + } + + fn dnn(x: &[f32], w: &[f32], b: &[f32]) -> Vec { + let x = tract_ndarray::Array1::from_vec(x.to_vec()); + let w = tract_ndarray::Array2::from_shape_vec([b.len(), x.len()], w.to_vec()).unwrap(); + let b = 
tract_ndarray::Array1::from_vec(b.to_vec()); + (w.dot(&x) + b).to_vec() + } + + pub fn predict(&self, m: usize, k: usize, n: usize) -> &str { + let mut x = self.features(m, k, n); + self.normalize(&mut x); + let mut hidden = Self::dnn(&x, self.w1, self.b1); + (crate::generic().tanh_f32)().run(&mut hidden).unwrap(); + let output = Self::dnn(&hidden, self.w2, self.b2); + let ix = output.iter().copied().position_max_by(order_f).unwrap(); + self.kernels[ix] + } + + pub fn pick( + &self, + impls: &[Box], + m: Option, + k: Option, + n: Option, + ) -> Box { + if let (Some(m), Some(k), Some(n)) = (m, k, n) { + let choice = self.predict(m, k, n); + impls.iter().find(|k| k.name() == choice).unwrap().clone() + } else { + impls.iter().find(|k| k.name() == self.big_product_kernel_choice).unwrap().clone() + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs new file mode 100644 index 000000000..e077b26f7 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs @@ -0,0 +1,125 @@ +use std::fmt::Debug; +use std::ops::Deref; + +use crate::pack::PackedFormat; +use crate::BinOp; + +use super::{MMMInputValue, OutputStore, OutputStoreKer}; +use tract_data::internal::*; + +#[repr(usize)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum RoundingPolicy { + Native, + Zero, + Away, + MinusInf, + PlusInf, + Even, + Odd, +} + +#[derive(Clone, Debug)] +pub enum AsInputValue<'t> { + Owned(Box), + Borrowed(&'t dyn MMMInputValue), +} + +impl Deref for AsInputValue<'_> { + type Target = dyn MMMInputValue; + fn deref(&self) -> &Self::Target { + match self { + AsInputValue::Owned(b) => &**b, + AsInputValue::Borrowed(r) => *r, + } + } +} + +#[derive(Clone, Debug)] +pub enum FusedSpec<'t> { + BinScalar(&'t Tensor, BinOp), + BinPerRow(TensorView<'t>, BinOp), + BinPerCol(TensorView<'t>, BinOp), + AddRowColProducts(&'t Tensor, &'t Tensor), + AddUnicast(OutputStore), + LeakyRelu(&'t Tensor), + QScale(isize, 
RoundingPolicy, i32), + RoundingShiftRight(usize, RoundingPolicy), + ShiftLeft(usize), + Store(OutputStore), + AddMatMul { a: AsInputValue<'t>, b: AsInputValue<'t>, packing: usize }, +} + +impl FusedSpec<'_> { + pub fn prefer_col_outer(&self) -> Option { + if let FusedSpec::AddMatMul { a, b, .. } = self { + let a_is_eager = a.format().is::(); + let b_is_eager = b.format().is::(); + if a_is_eager == b_is_eager { + None + } else { + Some(a_is_eager) + } + } else { + None + } + } +} + +// Careful here, the jump_to comments are used by the build script. +#[repr(C, usize)] +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +#[rustfmt::skip] +pub enum FusedKerSpec { + Done, // jump_to:done + Clear, // jump_to:clear + // + LoadTile(*const TI, *const TI), // jump_to:load_tile + + ScalarMin(TI), // jump_to:scalar_min + ScalarMax(TI), // jump_to:scalar_max + ScalarAdd(TI), // jump_to:scalar_add + ScalarMul(TI), // jump_to:scalar_mul + ScalarSub(TI), // jump_to:scalar_sub + ScalarSubF(TI), // jump_to:scalar_sub_flipped + + LeakyRelu(TI), // jump_to:leaky_relu + + PerRowMin(*const TI), // jump_to:per_row_min + PerRowMax(*const TI), // jump_to:per_row_max + PerRowAdd(*const TI), // jump_to:per_row_add + PerRowMul(*const TI), // jump_to:per_row_mul + PerRowSub(*const TI), // jump_to:per_row_sub + PerRowSubF(*const TI), // jump_to:per_row_sub_flipped + + PerColMin(*const TI), // jump_to:per_col_min + PerColMax(*const TI), // jump_to:per_col_max + PerColAdd(*const TI), // jump_to:per_col_add + PerColMul(*const TI), // jump_to:per_col_mul + PerColSub(*const TI), // jump_to:per_col_sub + PerColSubF(*const TI), // jump_to:per_col_sub_flipped + + QScale(isize, RoundingPolicy, i32), // jump_to:q_scale + RoundingShiftRight(usize, RoundingPolicy), // jump_to:q_shr + ShiftLeft(usize), // jump_to:q_shl + AddUnicast(OutputStoreKer), // jump_to:add_unicast + AddRowColProducts(*const TI, *const TI), // jump_to:add_row_col_products + Store(OutputStoreKer), // jump_to:store + + // 
jump_to:add_mat_mul + AddMatMul { k: usize, pa: *const u8, pb: *const u8, packing: usize }, +} + +unsafe impl Send for FusedKerSpec {} +unsafe impl Sync for FusedKerSpec {} + +#[cfg(test)] +#[test] +fn check_non_linear_enum_size() { + assert_eq!(std::mem::size_of::(), std::mem::size_of::()); + assert_eq!( + std::mem::size_of::>(), + std::mem::size_of::() + std::mem::size_of::() + ); + assert_eq!(std::mem::size_of::>(), 5 * std::mem::size_of::()); +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs new file mode 100644 index 000000000..0337578ca --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs @@ -0,0 +1,179 @@ +use downcast_rs::{impl_downcast, Downcast}; +use dyn_clone::DynClone; +use dyn_hash::DynHash; +use std::alloc::Layout; +use std::fmt::{Debug, Display}; +use std::hash::Hash; +use std::sync::Arc; +use tract_data::internal::*; + +use crate::WeightType; + +pub trait MMMInputFormat: Downcast + Debug + DynHash + DynClone + Send + Sync + Display { + fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult; + fn prepare_one( + &self, + t: &Tensor, + k_axis: usize, + mn_axis: usize, + ) -> TractResult>; + fn precursor(&self) -> WeightType; + fn r(&self) -> usize; + fn k_alignment(&self) -> usize; + fn same_as(&self, other: &dyn MMMInputFormat) -> bool; + fn merge_with<'o, 'a: 'o, 'b: 'o>( + &'a self, + other: &'b dyn MMMInputFormat, + ) -> Option<&'o dyn MMMInputFormat> { + if self.same_as(other) { + Some(other) + } else { + None + } + } + fn mem_size(&self, k: TDim, mn: TDim) -> TDim; + fn extract_at_mn_f16( + &self, + data: &EagerPackedInput, + mn: usize, + slice: &mut [f16], + ) -> TractResult<()>; + fn extract_at_mn_f32( + &self, + data: &EagerPackedInput, + mn: usize, + slice: &mut [f32], + ) -> TractResult<()>; +} + +dyn_clone::clone_trait_object!(MMMInputFormat); +impl_downcast!(MMMInputFormat); 
+dyn_hash::hash_trait_object!(MMMInputFormat); + +impl Eq for &dyn MMMInputFormat {} +impl PartialEq for &dyn MMMInputFormat { + fn eq(&self, other: &Self) -> bool { + self.same_as(*other) + } +} + +pub trait MMMInputValue: DynClone + Debug + DynHash + Send + Sync + Display + Downcast { + fn format(&self) -> &dyn MMMInputFormat; + fn scratch_panel_buffer_layout(&self) -> Option; + fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8>; + fn panels_count(&self) -> usize { + self.mn().divceil(self.format().r()) + } + fn mn(&self) -> usize; + fn k(&self) -> usize; + fn opaque_fact(&self) -> &dyn OpaqueFact; + fn same_as(&self, other: &dyn MMMInputValue) -> bool; + + fn extract_at_mn_f16(&self, mn: usize, slice: &mut [f16]) -> TractResult<()>; + fn extract_at_mn_f32(&self, mn: usize, slice: &mut [f32]) -> TractResult<()>; +} +dyn_clone::clone_trait_object!(MMMInputValue); +impl_downcast!(MMMInputValue); +dyn_hash::hash_trait_object!(MMMInputValue); + +impl From> for Opaque { + fn from(value: Box) -> Self { + Opaque(Arc::new(value)) + } +} + +impl OpaquePayload for Box { + fn same_as(&self, other: &dyn OpaquePayload) -> bool { + other + .downcast_ref::() + .is_some_and(|other| (&**self as &dyn MMMInputValue).same_as(&**other)) + } +} + +#[allow(clippy::derived_hash_with_manual_eq)] +#[derive(Clone, Hash, Debug)] +pub struct PackedOpaqueFact { + pub format: Box, + pub mn: TDim, + pub k: usize, +} + +impl Display for PackedOpaqueFact { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Eager {} tensor (mn={} k={})", self.format, self.mn, self.k) + } +} + +impl OpaqueFact for PackedOpaqueFact { + fn mem_size(&self) -> TDim { + self.format.mem_size(self.k.to_dim(), self.mn.clone()) + } + + fn same_as(&self, other: &dyn OpaqueFact) -> bool { + other.downcast_ref::().is_some_and(|o| o == self) + } +} + +impl PartialEq for PackedOpaqueFact { + fn eq(&self, other: &Self) -> bool { + self.format.same_as(&*other.format) 
&& self.mn == other.mn && self.k == other.k + } +} + +#[derive(Clone, Hash)] +pub struct EagerPackedInput { + pub fact: PackedOpaqueFact, + pub packed: Arc, + pub panel_bytes: usize, + pub mn: usize, +} + +impl MMMInputValue for EagerPackedInput { + fn scratch_panel_buffer_layout(&self) -> Option { + None + } + fn panel_bytes(&self, i: usize, _buffer: Option<*mut u8>) -> TractResult<*const u8> { + unsafe { Ok(self.packed.as_ptr().add(i * self.panel_bytes)) } + } + fn k(&self) -> usize { + self.fact.k + } + fn mn(&self) -> usize { + self.mn + } + fn format(&self) -> &dyn MMMInputFormat { + &*self.fact.format + } + fn opaque_fact(&self) -> &dyn OpaqueFact { + &self.fact + } + fn same_as(&self, other: &dyn MMMInputValue) -> bool { + other.downcast_ref::().is_some_and(|other| { + self.fact.same_as(&other.fact) + && self.packed == other.packed + && self.panel_bytes == other.panel_bytes + }) + } + fn extract_at_mn_f16(&self, mn: usize, slice: &mut [f16]) -> TractResult<()> { + ensure!(slice.len() == self.k()); + ensure!(mn < self.mn()); + self.fact.format.extract_at_mn_f16(self, mn, slice) + } + fn extract_at_mn_f32(&self, mn: usize, slice: &mut [f32]) -> TractResult<()> { + ensure!(slice.len() == self.k()); + ensure!(mn < self.mn()); + self.fact.format.extract_at_mn_f32(self, mn, slice) + } +} + +impl Display for EagerPackedInput { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + (&self.fact as &dyn Display).fmt(f) + } +} + +impl Debug for EagerPackedInput { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs new file mode 100644 index 000000000..8283b5d96 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs @@ -0,0 +1,159 @@ +use crate::frame::pack::PackedFormat; + +use super::*; +use std::borrow::Cow; +use std::fmt::Debug; + +use crate::LADatum; + +pub trait 
MatMatMulKer: Clone + Debug + Send + Sync + 'static { + type Acc: LADatum; + fn name(&self) -> &str; + fn kernel(&self, op: &[FusedKerSpec]) -> isize; + fn mr(&self) -> usize; + fn nr(&self) -> usize; + + fn quality(&self) -> ImplementationQuality; + fn dynamic_boost(&self) -> isize; + + #[allow(clippy::type_complexity)] + fn packings(&self) -> &[(Box, Box)]; + fn stores(&self) -> Cow<'_, [DatumType]>; + + #[allow(unused_variables)] + fn can_fuse(&self, spec: &FusedSpec) -> bool { + true + } + + #[allow(unused_variables)] + fn is_supported_here(&self) -> bool { + true + } +} + +type Kernel = unsafe fn(&[FusedKerSpec]) -> isize; + +#[derive(Clone)] +pub struct DynKernel { + pub name: String, + pub kernel: Kernel, + pub quality: ImplementationQuality, + pub packings: Vec<(Box, Box)>, + pub stores: Vec, + pub supported_predicate: fn() -> bool, + pub boost: fn() -> isize, + pub can_fuse: fn(&FusedSpec) -> bool, +} + +impl DynKernel { + pub fn new( + name: &str, + kernel: Kernel, + packing_a: PackedFormat, + packing_b: PackedFormat, + quality: ImplementationQuality, + ) -> Self { + let kernel = DynKernel { + name: name.to_string(), + kernel, + quality, + packings: vec![], + stores: vec![Acc::datum_type()], + supported_predicate: || true, + boost: || 0, + can_fuse: |_| true, + }; + kernel.with_packing(packing_a, packing_b) + } + + pub fn with_platform_condition(mut self, f: fn() -> bool) -> Self { + self.supported_predicate = f; + self + } + + pub fn with_boost(mut self, f: fn() -> isize) -> Self { + self.boost = f; + self + } + + pub fn with_packing(mut self, a: impl MMMInputFormat, b: impl MMMInputFormat) -> Self { + self.packings.push((Box::new(a), Box::new(b))); + self + } + + pub fn with_packing_a(self, a: impl MMMInputFormat) -> Self { + let b = self.regular_pack_b(); + self.with_packing(a, b) + } + + pub fn regular_pack_a(&self) -> PackedFormat { + *self.packings[0].0.clone().downcast::().unwrap() + } + + pub fn regular_pack_b(&self) -> PackedFormat { + 
*self.packings[0].1.clone().downcast::().unwrap() + } + + pub fn with_can_fuse(self, can_fuse: fn(&FusedSpec) -> bool) -> Self { + Self { can_fuse, ..self } + } + + pub fn with_store(mut self) -> Self { + self.stores.push(D::datum_type()); + self + } + + pub fn mmm(&self) -> Box { + Box::new(self.clone()) + } +} + +impl Debug for DynKernel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name) + } +} + +impl MatMatMulKer for DynKernel { + type Acc = Acc; + fn name(&self) -> &str { + &self.name + } + + fn mr(&self) -> usize { + MR + } + + fn nr(&self) -> usize { + NR + } + + fn quality(&self) -> ImplementationQuality { + self.quality + } + + fn is_supported_here(&self) -> bool { + (self.supported_predicate)() + } + + fn can_fuse(&self, spec: &FusedSpec) -> bool { + (self.can_fuse)(spec) + } + + fn kernel(&self, op: &[FusedKerSpec]) -> isize { + unsafe { (self.kernel)(op) } + } + + #[allow(clippy::type_complexity)] + fn packings(&self) -> &[(Box, Box)] { + &self.packings + } + + fn stores(&self) -> Cow<'_, [DatumType]> { + Cow::Borrowed(&self.stores) + } + + fn dynamic_boost(&self) -> isize { + (self.boost)() + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs new file mode 100644 index 000000000..d500c1c3b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs @@ -0,0 +1,124 @@ +macro_rules! MMMExternKernel { + ( + $func:ident<$ti:ident>($mr: expr, $nr: expr) + $(@($align_a:expr, $align_b:expr))? + $(where($where:expr))? + $(can_fuse($can_fuse:expr))? + $(packing[$pnum:literal] = $pid:ident => $packing:expr;)* + $(quality($quality:expr))? + $(boost($boost:expr))? + $(store($($store:ty),*))? + ) => { + paste! 
{ + mod [] { + #[allow(unused_imports)] + use super::*; + #[allow(unused_imports)] + use crate::frame::mmm::*; + extern_kernel!(fn $func(op: *const FusedKerSpec<$ti>) -> isize); + + #[inline] + pub unsafe fn rusty(op: &[FusedKerSpec<$ti>]) -> isize { + unsafe { $func(op.as_ptr()) } + } + } + + MMMKernel!([]::rusty as $func<$ti>($mr, $nr) + $(@($align_a, $align_b))? + $(where($where))? + $(can_fuse($can_fuse))? + $(packing[$pnum] = $pid => $packing;)* + $(quality($quality))? + $(boost($boost))? + $(store($($store),*))? + ); + } + }; +} +macro_rules! MMMRustKernel { + ( $func: path => + $id:ident<$ti:ident>($mr: expr, $nr: expr) + $(@($align_a:expr, $align_b:expr))? + $(where($where:expr))? + $(can_fuse($can_fuse:expr))? + $(packing[$pnum:literal] = $pid:ident => $packing:expr;)* + $(quality($quality:expr))? + $(store($($store:ty),*))? + ) => { + paste! { + mod [] { + #[allow(unused_imports)] + use crate::frame::mmm::*; + use super::*; + #[inline] + pub unsafe fn rusty(op: &[FusedKerSpec<$ti>]) -> isize { + unsafe { $func(op.as_ptr()) } + } + } + MMMKernel!([]::rusty as $id<$ti>($mr, $nr) + $(@($align_a, $align_b))? + generic(true) + $(where($where))? + $(can_fuse($can_fuse))? + $(packing[$pnum] = $pid => $packing;)* + $(quality($quality))? + $(store($($store),*))? + ); + } + } +} + +macro_rules! MMMKernel { + ( + $func: path as + $id:ident<$ti:ident>($mr: expr, $nr: expr) + $(@($align_a:expr, $align_b:expr))? + $(generic($generic:expr))? + $(where($where:expr))? + $(can_fuse($can_fuse:expr))? + $(packing[$pnum:literal] = $pid:ident => $packing:expr;)* + $(quality($quality:expr))? + $(boost($boost:expr))? + $(store($($store:ty),*))? + ) => { + paste! { + lazy_static::lazy_static! 
{ + pub static ref $id: $crate::mmm::DynKernel<$mr, $nr, $ti> = { + use $crate::mmm::DynKernel; + #[allow(unused_imports)] + use tract_data::prelude::*; + use $crate::pack::Packing; + #[allow(unused_mut)] + let (mut packing_a, mut packing_b) = ($ti::packing($mr), $ti::packing($nr)); + $( + packing_a = packing_a.align($align_a); + packing_b = packing_b.align($align_b); + )? + #[allow(unused_mut)] + let mut k = DynKernel::<$mr, $nr, $ti>::new(stringify!($id), $func, packing_a, packing_b, $crate::frame::mmm::ImplementationQuality::Dreadful); + $(k = k.with_platform_condition($where);)? + $( + assert!(k.packings.len() == $pnum); + let f: fn(DynKernel<$mr, $nr, $ti>) -> DynKernel<$mr, $nr, $ti> = $packing; + k = f(k); + )* + $($( + k.stores.push(<$store>::datum_type()); + )*)? + $(k.can_fuse = $can_fuse;)? + $(k.quality = $quality;)? + $(k = k.with_boost($boost);)? + k + }; + } + + #[cfg(test)] + mod [] { + use super::$id; + test_mmm_kernel!($ti, &*super::$id); + $(mmm_packed_packed_tests!(&*super::$id, $pid : $pnum);)* + $($(mmm_store_test!(&*super::$id, $store);)*)? 
+ } + } + }; +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs new file mode 100644 index 000000000..3200d293c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs @@ -0,0 +1,307 @@ +#[macro_use] +mod macros; + +pub mod cost_model; +#[macro_use] +pub(crate) mod fuse; +pub(crate) mod input_store; +pub(crate) mod kernel; +#[macro_use] +pub(crate) mod panel_extract; +mod scratch; +mod storage; + +#[cfg(test)] +#[macro_use] +pub mod tests; + +use crate::multithread::Executor; +#[cfg(feature = "multithread-mm")] +use rayon::prelude::*; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::fmt::Debug; +use tract_data::internal::*; + +pub use cost_model::*; +pub use fuse::*; +pub use input_store::*; +pub use kernel::*; +pub use panel_extract::*; +pub use scratch::*; +pub use storage::*; + +pub fn no_prefetch(_ptr: *const u8, _len: usize) {} + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum ImplementationQuality { + /// Individual operations are emulated by individual conversion (f16->f32->f16) + Dreadful, + /// Rust scalar operation (with whatever optimisation the compiler manages) + Generic, + /// Implicit vectorization (e.g. Rust code, some unrolled loops, explicit template instantiations for small constant) + RustOptimized, + /// Explicit vectorization (e.g. 
intrinsics vector code) + TargetOptimized, + /// Hand optimized (assembly) + ManuallyOptimized, +} + +impl ImplementationQuality { + pub fn best_to_worst() -> &'static [ImplementationQuality] { + use ImplementationQuality::*; + &[ManuallyOptimized, TargetOptimized, RustOptimized, Generic, Dreadful] + } + + pub fn cost(&self) -> usize { + ImplementationQuality::best_to_worst().iter().position(|x| x == self).unwrap() + } +} + +impl PartialOrd for ImplementationQuality { + fn partial_cmp(&self, other: &Self) -> Option { + Some(usize::from(*self).cmp(&usize::from(*other))) + } +} + +impl From for usize { + fn from(value: ImplementationQuality) -> Self { + value.cost() + } +} + +pub trait MatMatMul: Debug + dyn_clone::DynClone + Send + Sync + std::any::Any { + fn name(&self) -> &str; + fn mr(&self) -> usize; + fn nr(&self) -> usize; + + fn quality(&self) -> ImplementationQuality; + fn dynamic_boost(&self) -> isize; + + #[allow(clippy::type_complexity)] + fn packings(&self) -> &[(Box, Box)]; + + fn internal_type(&self) -> DatumType; + + unsafe fn c_view(&self, m_axis: Option, n_axis: Option) -> OutputStoreSpec; + unsafe fn c_from_data_and_strides( + &self, + item_size: usize, + row_stride: isize, + col_stride: isize, + ) -> OutputStoreSpec; + + fn can_fuse(&self, spec: &FusedSpec) -> bool; + + fn stores(&self) -> Cow<'_, [DatumType]>; + + unsafe fn run(&self, m: usize, n: usize, non_linear: &[FusedSpec]) -> TractResult<()> { + unsafe { + let mut scratch = self.allocate_scratch_space(); + self.run_with_scratch_space(m, n, &mut *scratch, non_linear) + } + } + + unsafe fn allocate_scratch_space(&self) -> Box; + unsafe fn can_use_scratch_space(&self, scratch: &dyn ScratchSpace) -> bool; + unsafe fn run_with_scratch_space( + &self, + m: usize, + n: usize, + scratch: &mut dyn ScratchSpace, + non_linear: &[FusedSpec], + ) -> TractResult<()>; +} + +dyn_clone::clone_trait_object!(MatMatMul); + +impl PartialEq for Box { + fn eq(&self, other: &Box) -> bool { + self.name() == 
other.name() + } +} + +impl std::hash::Hash for Box { + fn hash(&self, state: &mut H) { + self.name().hash(state) + } +} + +impl MatMatMul for K { + fn name(&self) -> &str { + self.name() + } + fn mr(&self) -> usize { + self.mr() + } + fn nr(&self) -> usize { + self.nr() + } + + fn quality(&self) -> ImplementationQuality { + MatMatMulKer::quality(self) + } + + fn dynamic_boost(&self) -> isize { + MatMatMulKer::dynamic_boost(self) + } + + fn packings(&self) -> &[(Box, Box)] { + self.packings() + } + + fn internal_type(&self) -> DatumType { + K::Acc::datum_type() + } + + fn can_fuse(&self, spec: &FusedSpec) -> bool { + self.can_fuse(spec) + } + + unsafe fn c_view(&self, m_axis: Option, n_axis: Option) -> OutputStoreSpec { + OutputStoreSpec::View { m_axis, n_axis, mr: self.mr(), nr: self.nr() } + } + + unsafe fn c_from_data_and_strides( + &self, + item_size: usize, + row_stride: isize, + col_stride: isize, + ) -> OutputStoreSpec { + OutputStoreSpec::Strides { + row_byte_stride: row_stride * item_size as isize, + col_byte_stride: col_stride * item_size as isize, + mr: self.mr(), + nr: self.nr(), + } + } + + fn stores(&self) -> Cow<'_, [DatumType]> { + self.stores() + } + + unsafe fn allocate_scratch_space(&self) -> Box { + Box::>::default() + } + + unsafe fn can_use_scratch_space(&self, scratch: &dyn ScratchSpace) -> bool { + scratch.downcast_ref::>().is_some() + } + + unsafe fn run_with_scratch_space( + &self, + m: usize, + n: usize, + scratch: &mut dyn ScratchSpace, + non_linear: &[FusedSpec], + ) -> TractResult<()> { + unsafe { + let scratch = scratch + .downcast_mut::>() + .context("Wrong scratch space type")?; + scratch.prepare(self, m, n, non_linear)?; + if n == 1 && self.nr() == 1 { + run_with_scratch_space_vec(self, m, scratch, non_linear) + } else { + let (mut prefer_col, mut prefer_row) = (0, 0); + for uop in non_linear.iter() { + if let Some(col) = uop.prefer_col_outer() { + prefer_col = col as usize; + prefer_row = (!col) as usize; + } + } + if prefer_col > 
prefer_row { + run_with_scratch_space_col_outer(self, m, n, scratch, non_linear) + } else { + run_with_scratch_space_row_outer(self, m, n, scratch, non_linear) + } + } + } + } +} + +unsafe fn run_with_scratch_space_vec( + ker: &K, + m: usize, + scratch: &mut ScratchSpaceImpl, + non_linear: &[FusedSpec], +) -> TractResult<()> { + unsafe { + match crate::multithread::current_tract_executor() { + Executor::SingleThread => { + for ia in 0..m.divceil(ker.mr()) { + scratch.run(ker, non_linear, ia, 0)?; + } + Ok(()) + } + #[cfg(feature = "multithread-mm")] + Executor::MultiThread(pool) => pool.install(|| { + (0..m.div_ceil(ker.mr())) + .into_par_iter() + .try_for_each(|ia| scratch.run(ker, non_linear, ia, 0)) + }), + } + } +} + +unsafe fn run_with_scratch_space_col_outer( + ker: &K, + m: usize, + n: usize, + scratch: &mut ScratchSpaceImpl, + non_linear: &[FusedSpec], +) -> TractResult<()> { + unsafe { + match crate::multithread::current_tract_executor() { + Executor::SingleThread => { + for ib in 0..n.divceil(ker.nr()) { + for ia in 0..m.divceil(ker.mr()) { + scratch.run(ker, non_linear, ia, ib)?; + } + } + Ok(()) + } + #[cfg(feature = "multithread-mm")] + Executor::MultiThread(pool) => pool.install(|| { + (0..n.div_ceil(ker.nr())).into_par_iter().try_for_each(|ib| { + for ia in 0..m.divceil(ker.mr()) { + scratch.run(ker, non_linear, ia, ib)?; + } + Ok(()) + }) + }), + } + } +} + +unsafe fn run_with_scratch_space_row_outer( + ker: &K, + m: usize, + n: usize, + scratch: &mut ScratchSpaceImpl, + non_linear: &[FusedSpec], +) -> TractResult<()> { + unsafe { + match crate::multithread::current_tract_executor() { + Executor::SingleThread => { + for ia in 0..m.divceil(ker.mr()) { + for ib in 0..n.divceil(ker.nr()) { + scratch.run(ker, non_linear, ia, ib)?; + } + } + Ok(()) + } + #[cfg(feature = "multithread-mm")] + Executor::MultiThread(pool) => pool.install(|| { + pool.install(|| { + (0..m.div_ceil(ker.mr())).into_par_iter().try_for_each(|ia| { + for ib in 
0..n.divceil(ker.nr()) { + scratch.run(ker, non_linear, ia, ib)?; + } + Ok(()) + }) + }) + }), + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs new file mode 100644 index 000000000..31862ab48 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs @@ -0,0 +1,300 @@ +use std::fmt::{Debug, Display}; +use tract_data::internal::*; + +use super::{EagerPackedInput, MMMInputFormat, MMMInputValue}; +use crate::pack::PackedFormat; + +type Kernel = unsafe fn(input: *const u8, output: *mut u8, k: usize); + +#[allow(clippy::derived_hash_with_manual_eq)] +#[derive(Hash, Clone)] +pub struct PanelExtractor { + pub name: String, + pub from: Box, + pub to: PackedFormat, + pub kernel: Kernel, + pub supported_predicate: fn() -> bool, +} + +impl Debug for PanelExtractor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({:?} -> {:?})", self.name, self.from, self.to) + } +} + +impl Display for PanelExtractor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name) + } +} + +impl PartialEq for PanelExtractor { + fn eq(&self, other: &Self) -> bool { + self.name == other.name && self.from.same_as(&*other.from) && self.to == other.to + } +} + +impl PanelExtractor { + #[allow(unused_variables)] + pub fn is_supported_here(&self) -> bool { + (self.supported_predicate)() + } +} + +#[derive(Clone, Hash)] +pub struct PanelExtractInput { + pub format: PanelExtractor, + pub data: EagerPackedInput, +} + +impl MMMInputValue for PanelExtractInput { + fn scratch_panel_buffer_layout(&self) -> Option { + Some(self.format.to.single_panel_layout(self.data.k(), self.format.to.dt.size_of())) + } + fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8> { + let scratch = buffer.unwrap(); + unsafe { + let source = self.data.packed.as_ptr().add(self.data.panel_bytes * i); + 
(self.format.kernel)(source, scratch, self.data.k()); + } + Ok(scratch) + } + fn mn(&self) -> usize { + self.data.mn() + } + fn k(&self) -> usize { + self.data.k() + } + fn format(&self) -> &dyn MMMInputFormat { + &self.format.to + } + fn opaque_fact(&self) -> &dyn OpaqueFact { + self.data.opaque_fact() + } + fn same_as(&self, other: &dyn MMMInputValue) -> bool { + other + .downcast_ref::() + .is_some_and(|o| o.format == self.format && o.data.same_as(&self.data)) + } + fn extract_at_mn_f16(&self, mn: usize, slice: &mut [f16]) -> TractResult<()> { + self.data.extract_at_mn_f16(mn, slice) + } + fn extract_at_mn_f32(&self, mn: usize, slice: &mut [f32]) -> TractResult<()> { + self.data.extract_at_mn_f32(mn, slice) + } +} + +impl Display for PanelExtractInput { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "PanelExtract({})", self.data) + } +} + +impl Debug for PanelExtractInput { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "PanelExtract({})", self.data) + } +} + +#[macro_export] +macro_rules! panel_extractor { + ( $func:path as $id:ident($from:expr, $to: expr) + $(where($where:expr))? + ) => { + paste! { + lazy_static::lazy_static! { + pub static ref $id: $crate::mmm::PanelExtractor = { + use $crate::mmm::MMMInputFormat; + let (from, to) = ($from, $to); + assert!(from.r() == to.r()); + #[allow(unused_mut)] + let mut it = $crate::mmm::PanelExtractor { + name: stringify!($id).to_string(), + from, + to, + kernel: $func, + supported_predicate: || true + }; + $( + it.supported_predicate = $where; + )? 
+ it + }; + } + + #[cfg(test)] + mod [] { + use super::$id; + #[test] + fn repack_0block_1panel() { + $crate::frame::mmm::panel_extract::test::test_packing(&$id, 0, 1).unwrap(); + } + + #[test] + fn repack_1block_0panel() { + $crate::frame::mmm::panel_extract::test::test_packing(&$id, 1, 0).unwrap(); + } + + #[test] + fn repack_1block_1panel() { + $crate::frame::mmm::panel_extract::test::test_packing(&$id, 1, 1).unwrap(); + } + + #[test] + fn repack_2block_1panel() { + $crate::frame::mmm::panel_extract::test::test_packing(&$id, 2, 1).unwrap(); + } + + #[test] + fn repack_1block_2panel() { + $crate::frame::mmm::panel_extract::test::test_packing(&$id, 1, 2).unwrap(); + } + + #[test] + fn repack_2block_2panel() { + $crate::frame::mmm::panel_extract::test::test_packing(&$id, 2, 2).unwrap(); + } + } + } + }; +} + +#[cfg(test)] +pub mod test { + use crate::frame::block_quant::PackedBlockQuantFormat; + use tract_data::internal::*; + use tract_ndarray::Array2; + + use super::*; + + pub fn test_packing( + extractor: &PanelExtractor, + blocks: usize, + panels: usize, + ) -> TractResult<()> { + if !extractor.is_supported_here() { + return Ok(()); + } + assert!(extractor.from.r() == extractor.to.r()); + assert!(extractor.to.dt == f32::datum_type() || extractor.to.dt == f16::datum_type()); + if let Some(from) = extractor.from.downcast_ref::() { + test_packing_bq(extractor, from, blocks, panels) + } else if let Some(from) = extractor.from.downcast_ref() { + test_packing_plain(extractor, from, blocks, panels) + } else { + todo!() + } + } + + pub fn test_packing_plain( + extractor: &PanelExtractor, + from: &PackedFormat, + blocks: usize, + panels: usize, + ) -> TractResult<()> { + let m = from.r * panels; + let k = 8 * blocks; // 8 is arbitrary + let to = &extractor.to; + let weights_orig = + Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.) + .into_tensor() + .cast_to_dt(from.dt)? 
+ .into_owned(); + let packed_orig = from.prepare_tensor(&weights_orig, 1, 0)?; + let packed_orig = + packed_orig.to_scalar::()?.downcast_ref::>().unwrap(); + let packed_orig = packed_orig.downcast_ref::().unwrap(); + + for panel in 0..panels { + let orig_panel = &packed_orig.packed[packed_orig.panel_bytes * panel..] + [..k * from.r * from.dt.size_of()]; + let mut reference_panel = Tensor::zero_dt(from.dt, &[k, from.r])?; + reference_panel.as_bytes_mut().copy_from_slice(orig_panel); + reference_panel = reference_panel.cast_to_dt(to.dt)?.into_owned(); + + let mut tested_panel = Tensor::zero_dt(to.dt, &[k, from.r])?; + unsafe { + (extractor.kernel)( + orig_panel.as_ptr(), + tested_panel.as_bytes_mut().as_mut_ptr(), + k, + ); + } + compare_panels(&tested_panel, &reference_panel, from.r, k); + } + Ok(()) + } + + pub fn test_packing_bq( + extractor: &PanelExtractor, + from: &PackedBlockQuantFormat, + blocks: usize, + panels: usize, + ) -> TractResult<()> { + let m = from.r * panels; + let k = from.bq.block_len() * blocks; + let to = &extractor.to; + let weights_orig = + Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.) + .into_tensor() + .cast_to_dt(to.dt)? + .into_owned(); + let weights = if to.dt == f32::datum_type() { + from.bq + .dequant_f32(&from.bq.quant_f32(weights_orig.as_slice::()?)?)? + .into_shape(&[m, k])? + } else { + from.bq + .dequant_f16(&from.bq.quant_f16(weights_orig.as_slice::()?)?)? + .into_shape(&[m, k])? + }; + let block_quant = if to.dt == f32::datum_type() { + from.bq.quant_f32(weights.as_slice::()?)? + } else { + from.bq.quant_f16(weights.as_slice::()?)? 
+ }; + let packed_block_quant = + from.bq.pack(&block_quant, k, from.r, from.zip, from.scales_at_end)?; + + let mut reference_panel = Tensor::zero_dt(to.dt, &[k, from.r])?; + let mut tested_panel = Tensor::zero_dt(to.dt, &[k, from.r])?; + + for panel in 0..packed_block_quant.panels_count() { + unsafe { + from.bq.extract_packed_panel( + &packed_block_quant, + to, + panel, + reference_panel.as_bytes_mut().as_mut_ptr(), + )?; + + let source = + packed_block_quant.packed.as_ptr().add(packed_block_quant.panel_bytes * panel); + (extractor.kernel)(source, tested_panel.as_bytes_mut().as_mut_ptr(), k); + } + compare_panels(&tested_panel, &reference_panel, from.r, k); + } + Ok(()) + } + + fn compare_panels(tested_panel: &Tensor, reference_panel: &Tensor, r: usize, k: usize) { + if tested_panel != reference_panel { + if reference_panel.datum_type() == f32::datum_type() { + crate::frame::mmm::tests::display_error( + tested_panel.as_slice::().unwrap(), + reference_panel.as_slice::().unwrap(), + r, + k, + ); + } else { + crate::frame::mmm::tests::display_error( + tested_panel.as_slice::().unwrap(), + reference_panel.as_slice::().unwrap(), + r, + k, + ); + } + } + assert_eq!(tested_panel, reference_panel); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs new file mode 100644 index 000000000..aec2e265a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs @@ -0,0 +1,529 @@ +use super::{FusedKerSpec, FusedSpec, MatMatMulKer, OutputStoreKer}; +use crate::{BinOp, LADatum}; +use downcast_rs::{impl_downcast, Downcast}; +use std::cell::RefCell; +use std::fmt::Debug; +use std::sync::atomic::AtomicUsize; +use tract_data::internal::num_integer::Integer; +use tract_data::internal::*; + +static GENERATION: AtomicUsize = AtomicUsize::new(1); + +thread_local! 
{ + static TLS: RefCell = Default::default(); +} + +#[derive(Default, Debug)] +struct TLSScratch { + generation: usize, + blob: Blob, + ker_specs_16: Vec>, + ker_specs_32: Vec>, + ker_specs_64: Vec>, +} + +impl TLSScratch { + #[allow(unknown_lints, clippy::missing_transmute_annotations)] + fn ker_specs(&mut self) -> &mut Vec> { + unsafe { + if TI::datum_type() == f32::datum_type() || TI::datum_type() == i32::datum_type() { + std::mem::transmute(&mut self.ker_specs_32) + } else if TI::datum_type() == f16::datum_type() { + std::mem::transmute(&mut self.ker_specs_16) + } else if TI::datum_type() == f64::datum_type() { + std::mem::transmute(&mut self.ker_specs_64) + } else { + todo!(); + } + } + } + + fn sync(&mut self, scratch: &ScratchSpaceImpl) { + if self.generation == scratch.generation { + return; + } + let ker_specs = self.ker_specs::(); + ker_specs.clear(); + ker_specs.extend_from_slice(&scratch.ker_specs); + + unsafe { + self.blob.ensure_size_and_align(scratch.blob_size, scratch.blob_align); + + for LocDependant { loc, ker_spec, .. } in &scratch.loc_dependant { + #[allow(clippy::single_match)] + if matches!(scratch.ker_specs[*ker_spec], FusedKerSpec::AddMatMul { .. 
}) { + let scratch = &mut *(self.blob.as_ptr().add(*loc) as *mut AddMatMulTemp); + scratch.panel_a_id = usize::MAX; + scratch.panel_b_id = usize::MAX; + }; + } + } + self.generation = scratch.generation; + } +} + +pub trait ScratchSpace: Downcast + Send {} +impl_downcast!(ScratchSpace); + +#[derive(Debug, Default)] +pub struct ScratchSpaceImpl { + generation: usize, + blob_size: usize, + blob_align: usize, + ker_specs: Vec>, + loc_dependant: TVec, + valid_down_tiles: usize, + remnant_down: usize, + valid_right_tiles: usize, + remnant_right: usize, +} + +#[derive(Debug, new)] +struct LocDependant { + spec: usize, + ker_spec: usize, + // offset for the location dependant structure + loc: usize, + // offset of its associated dynamic-size buffers + buffer_a: Option, + buffer_b: Option, +} + +impl ScratchSpace for ScratchSpaceImpl {} +unsafe impl Send for ScratchSpaceImpl {} + +#[derive(Debug)] +struct AddMatMulTemp { + ptr_a: *const u8, + panel_a_id: usize, + ptr_b: *const u8, + panel_b_id: usize, +} + +impl ScratchSpaceImpl { + pub unsafe fn prepare( + &mut self, + ker: &impl MatMatMulKer, + m: usize, + n: usize, + specs: &[FusedSpec], + ) -> TractResult<()> { + use FusedKerSpec as FKS; + use FusedSpec as FS; + self.ker_specs.clear(); + self.loc_dependant.clear(); + self.ker_specs.reserve(specs.len() + 2); + self.ker_specs.push(FusedKerSpec::Clear); + self.valid_down_tiles = m / ker.mr(); + self.remnant_down = m % ker.mr(); + self.valid_right_tiles = n / ker.nr(); + self.remnant_right = n % ker.nr(); + let mut offset = 0; + let mut align = std::mem::size_of::<*const ()>(); + fn ld(spec: usize, uspec: usize, loc: usize) -> LocDependant { + LocDependant { spec, ker_spec: uspec, loc, buffer_a: None, buffer_b: None } + } + for (ix, spec) in specs.iter().enumerate() { + offset = offset.next_multiple_of(&align); + let ker_spec = match spec { + FS::BinScalar(t, op) => match op { + BinOp::Min => FKS::ScalarMin(*t.to_scalar()?), + BinOp::Max => FKS::ScalarMax(*t.to_scalar()?), 
+ BinOp::Mul => FKS::ScalarMul(*t.to_scalar()?), + BinOp::Add => FKS::ScalarAdd(*t.to_scalar()?), + BinOp::Sub => FKS::ScalarSub(*t.to_scalar()?), + BinOp::SubF => FKS::ScalarSubF(*t.to_scalar()?), + }, + FS::ShiftLeft(s) => FKS::ShiftLeft(*s), + FS::RoundingShiftRight(s, rp) => FKS::RoundingShiftRight(*s, *rp), + FS::QScale(s, rp, m) => FKS::QScale(*s, *rp, *m), + FS::BinPerRow(_, _) => { + self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset)); + offset += TI::datum_type().size_of() * ker.mr(); + FusedKerSpec::Done + } + FS::BinPerCol(_, _) => { + self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset)); + offset += TI::datum_type().size_of() * ker.nr(); + FusedKerSpec::Done + } + FS::AddRowColProducts(_, _) => { + self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset)); + offset += TI::datum_type().size_of() * (ker.mr() + ker.nr()); + FusedKerSpec::Done + } + FS::AddUnicast(_) => { + self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset)); + offset += TI::datum_type().size_of() * ker.mr() * ker.nr(); + FusedKerSpec::Done + } + FS::Store(store) => { + self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset)); + offset += store.item_size * ker.mr() * ker.nr(); + FusedKerSpec::Done + } + FS::LeakyRelu(t) => FKS::LeakyRelu(*t.to_scalar()?), + FS::AddMatMul { a, b, packing } => { + let mut ld = ld(ix, self.ker_specs.len(), offset); + offset += std::mem::size_of::(); + if let Some(tmp) = a.scratch_panel_buffer_layout() { + align = tmp.align().lcm(&align); + offset = Integer::next_multiple_of(&offset, &tmp.align()); + ld.buffer_a = Some(offset); + offset += tmp.size(); + } + if let Some(tmp) = b.scratch_panel_buffer_layout() { + align = tmp.align().lcm(&align); + offset = Integer::next_multiple_of(&offset, &tmp.align()); + ld.buffer_b = Some(offset); + offset += tmp.size(); + } + self.loc_dependant.push(ld); + FusedKerSpec::AddMatMul { + k: 0, + pa: std::ptr::null(), + pb: std::ptr::null(), + packing: *packing, + } + } + }; + 
self.ker_specs.push(ker_spec); + } + self.ker_specs.push(FKS::Done); + self.blob_size = offset; + self.blob_align = align; + + self.generation = GENERATION.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + Ok(()) + } + + pub unsafe fn run( + &self, + ker: &impl MatMatMulKer, + specs: &[FusedSpec], + down: usize, + right: usize, + ) -> TractResult<()> { + unsafe { + TLS.with_borrow_mut(|tls| { + tls.sync(self); + if down < self.valid_down_tiles && right < self.valid_right_tiles { + self.for_valid_tile(ker, specs, tls, down, right)?; + let err = ker.kernel(tls.ker_specs()); + debug_assert_eq!(err, 0, "Kernel return error {err}"); + } else { + let remnant_down = + if down < self.valid_down_tiles { ker.mr() } else { self.remnant_down }; + let remnant_right = + if right < self.valid_right_tiles { ker.nr() } else { self.remnant_right }; + self.for_border_tile( + ker, + specs, + tls, + down, + right, + remnant_down, + remnant_right, + )?; + let err = ker.kernel(tls.ker_specs()); + debug_assert_eq!(err, 0, "Kernel return error {err}"); + self.postprocess_tile(specs, tls, down, right, remnant_down, remnant_right)?; + } + Ok(()) + }) + } + } + + #[inline(always)] + unsafe fn for_valid_tile( + &self, + ker: &impl MatMatMulKer, + specs: &[FusedSpec], + tls: &mut TLSScratch, + down: usize, + right: usize, + ) -> TractResult<()> { + unsafe { + use FusedKerSpec as FKS; + use FusedSpec as FS; + let ScratchSpaceImpl { ker_specs, loc_dependant, .. 
} = self; + debug_assert!(specs.len() + 2 == ker_specs.len()); + for LocDependant { spec, ker_spec, loc, buffer_a, buffer_b } in loc_dependant { + let spec = specs.get_unchecked(*spec); + let it = match spec { + FS::BinPerRow(v, op) => { + let v = v.as_ptr_unchecked::().add(down * ker.mr()); + match op { + BinOp::Min => FKS::PerRowMin(v), + BinOp::Max => FKS::PerRowMax(v), + BinOp::Add => FKS::PerRowAdd(v), + BinOp::Mul => FKS::PerRowMul(v), + BinOp::Sub => FKS::PerRowSub(v), + BinOp::SubF => FKS::PerRowSubF(v), + } + } + FS::BinPerCol(v, op) => { + let v = v.as_ptr_unchecked::().add(right * ker.nr()); + match op { + BinOp::Min => FKS::PerColMin(v), + BinOp::Max => FKS::PerColMax(v), + BinOp::Add => FKS::PerColAdd(v), + BinOp::Mul => FKS::PerColMul(v), + BinOp::Sub => FKS::PerColSub(v), + BinOp::SubF => FKS::PerColSubF(v), + } + } + FS::AddRowColProducts(rows, cols) => { + let row_ptr = rows.as_ptr_unchecked::().add(down * ker.mr()); + let col_ptr = cols.as_ptr_unchecked::().add(right * ker.nr()); + FKS::AddRowColProducts(row_ptr, col_ptr) + } + FS::AddUnicast(store) => FKS::AddUnicast(store.tile_c(down, right)), + FS::Store(c_store) => FKS::Store(c_store.tile_c(down, right)), + FS::AddMatMul { a, b, packing } => { + let scratch = (tls.blob.as_mut_ptr().add(*loc) as *mut AddMatMulTemp) + .as_mut() + .unwrap(); + if scratch.panel_a_id != down { + scratch.ptr_a = a.panel_bytes( + down, + buffer_a.map(|o| tls.blob.as_mut_ptr().add(o)), + )?; + scratch.panel_a_id = down; + } + if scratch.panel_b_id != right { + scratch.ptr_b = b.panel_bytes( + right, + buffer_b.map(|o| tls.blob.as_mut_ptr().add(o)), + )?; + scratch.panel_b_id = right; + } + FKS::AddMatMul { + k: b.k(), + pa: scratch.ptr_a, + pb: scratch.ptr_b, + packing: *packing, + } + } + _ => std::hint::unreachable_unchecked(), + }; + *tls.ker_specs().get_unchecked_mut(*ker_spec) = it; + } + Ok(()) + } + } + + #[inline(never)] + #[allow(clippy::too_many_arguments)] + unsafe fn for_border_tile( + &self, + ker: &impl 
MatMatMulKer, + specs: &[FusedSpec], + tls: &mut TLSScratch, + down: usize, + right: usize, + m_remnant: usize, + n_remnant: usize, + ) -> TractResult<()> { + unsafe { + use FusedKerSpec as FKS; + use FusedSpec as FS; + for LocDependant { spec, ker_spec: uspec, loc, buffer_a, buffer_b } in + &self.loc_dependant + { + let loc = tls.blob.as_mut_ptr().add(*loc); + let spec = specs.get_unchecked(*spec); + let it = match spec { + FS::BinPerRow(v, op) => { + let buf = std::slice::from_raw_parts_mut(loc as *mut TI, ker.mr()); + let ptr = if m_remnant < ker.mr() { + if m_remnant > 0 { + buf.get_unchecked_mut(..m_remnant).copy_from_slice( + v.as_slice_unchecked() + .get_unchecked(down * ker.mr()..) + .get_unchecked(..m_remnant), + ); + } + if cfg!(debug_assertions) { + buf.get_unchecked_mut(m_remnant..) + .iter_mut() + .for_each(|x| *x = TI::zero()); + } + buf.as_ptr() + } else { + v.as_ptr_unchecked::().add(down * ker.mr()) + }; + match op { + BinOp::Min => FKS::PerRowMin(ptr), + BinOp::Max => FKS::PerRowMax(ptr), + BinOp::Add => FKS::PerRowAdd(ptr), + BinOp::Mul => FKS::PerRowMul(ptr), + BinOp::Sub => FKS::PerRowSub(ptr), + BinOp::SubF => FKS::PerRowSubF(ptr), + } + } + FS::BinPerCol(v, op) => { + let buf = std::slice::from_raw_parts_mut(loc as *mut TI, ker.nr()); + let ptr = if n_remnant < ker.nr() { + if n_remnant > 0 { + buf.get_unchecked_mut(..n_remnant).copy_from_slice( + v.as_slice_unchecked() + .get_unchecked(right * ker.nr()..) + .get_unchecked(..n_remnant), + ); + } + if cfg!(debug_assertions) { + buf.get_unchecked_mut(n_remnant..) 
+ .iter_mut() + .for_each(|x| *x = TI::zero()); + } + buf.as_ptr() + } else { + v.as_ptr_unchecked::().add(right * ker.nr()) + }; + match op { + BinOp::Min => FKS::PerColMin(ptr), + BinOp::Max => FKS::PerColMax(ptr), + BinOp::Add => FKS::PerColAdd(ptr), + BinOp::Mul => FKS::PerColMul(ptr), + BinOp::Sub => FKS::PerColSub(ptr), + BinOp::SubF => FKS::PerColSubF(ptr), + } + } + FS::AddRowColProducts(rows, cols) => { + let r = std::slice::from_raw_parts_mut(loc as *mut TI, ker.mr()); + let row_ptr = if m_remnant < ker.mr() { + r.get_unchecked_mut(..m_remnant).copy_from_slice( + rows.as_slice_unchecked() + .get_unchecked(down * ker.mr()..) + .get_unchecked(..m_remnant), + ); + if cfg!(debug_assertions) { + r.get_unchecked_mut(m_remnant..) + .iter_mut() + .for_each(|x| *x = TI::zero()); + } + r.as_ptr() + } else { + rows.as_ptr_unchecked::().add(down * ker.mr()) + }; + let c = std::slice::from_raw_parts_mut( + (loc as *mut TI).add(ker.mr()), + ker.nr(), + ); + let col_ptr = if n_remnant < ker.nr() { + c.get_unchecked_mut(..n_remnant).copy_from_slice( + cols.as_slice_unchecked() + .get_unchecked(right * ker.nr()..) + .get_unchecked(..n_remnant), + ); + if cfg!(debug_assertions) { + c.get_unchecked_mut(n_remnant..) /* zero the unused tail of the column scratch `c` (len nr), not the row scratch `r` (len mr) */
+ .iter_mut() + .for_each(|x| *x = TI::zero()); + } + c.as_ptr() + } else { + cols.as_ptr_unchecked::().add(right * ker.nr()) + }; + FKS::AddRowColProducts(row_ptr, col_ptr) + } + FS::AddUnicast(store) => { + let row_byte_stride = store.row_byte_stride; + let col_byte_stride = store.col_byte_stride; + let tile_offset = row_byte_stride * down as isize * ker.mr() as isize + + col_byte_stride * right as isize * ker.nr() as isize; + let tile_ptr = store.ptr.offset(tile_offset); + let tmp_d_tile = + std::slice::from_raw_parts_mut(loc as *mut TI, ker.mr() * ker.nr()); + if cfg!(debug_assertions) { + tmp_d_tile.iter_mut().for_each(|t| *t = TI::zero()); + } + for r in 0..m_remnant as isize { + for c in 0..n_remnant as isize { + let inner_offset = c * col_byte_stride + r * row_byte_stride; + if inner_offset + tile_offset + < (store.item_size * store.item_count) as isize + { + *tmp_d_tile + .get_unchecked_mut(r as usize + c as usize * ker.mr()) = + *(tile_ptr.offset(inner_offset) as *const TI); + } + } + } + FKS::AddUnicast(OutputStoreKer { + ptr: tmp_d_tile.as_ptr() as _, + row_byte_stride: std::mem::size_of::() as isize, + col_byte_stride: (std::mem::size_of::() * ker.mr()) as isize, + item_size: std::mem::size_of::(), + }) + } + FS::Store(c_store) => { + let tmpc = OutputStoreKer { + ptr: loc as _, + item_size: c_store.item_size, + row_byte_stride: c_store.item_size as isize, + col_byte_stride: (c_store.item_size * ker.mr()) as isize, + }; + FKS::Store(tmpc) + } + FS::AddMatMul { a, b, packing } => { + let scratch = (loc as *mut AddMatMulTemp).as_mut().unwrap(); + if scratch.panel_a_id != down { + scratch.ptr_a = a.panel_bytes( + down, + buffer_a.map(|o| tls.blob.as_mut_ptr().add(o)), + )?; + scratch.panel_a_id = down; + } + if scratch.panel_b_id != right { + scratch.ptr_b = b.panel_bytes( + right, + buffer_b.map(|o| tls.blob.as_mut_ptr().add(o)), + )?; + scratch.panel_b_id = right; + } + FKS::AddMatMul { + k: b.k(), + pa: scratch.ptr_a, + pb: scratch.ptr_b, + packing: 
*packing, + } + } + _ => std::hint::unreachable_unchecked(), + }; + *tls.ker_specs().get_unchecked_mut(*uspec) = it; + } + Ok(()) + } + } + + #[inline] + pub fn uspecs(&self) -> &[FusedKerSpec] { + &self.ker_specs + } + + unsafe fn postprocess_tile( + &self, + specs: &[FusedSpec], + tls: &mut TLSScratch, + down: usize, + right: usize, + m_remnant: usize, + n_remnant: usize, + ) -> TractResult<()> + where + TI: LADatum, + { + unsafe { + for LocDependant { spec, ker_spec: uspec, .. } in self.loc_dependant.iter() { + let spec = specs.get_unchecked(*spec); + let ker_spec = tls.ker_specs::().get_unchecked(*uspec); + if let (FusedSpec::Store(c_store), FusedKerSpec::Store(tmp)) = (spec, ker_spec) { + c_store.set_from_tile(down, right, m_remnant, n_remnant, tmp) + } + } + Ok(()) + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs new file mode 100644 index 000000000..959128cff --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs @@ -0,0 +1,139 @@ +use std::fmt::Debug; +use tract_data::internal::*; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub enum OutputStoreSpec { + View { m_axis: Option, n_axis: Option, mr: usize, nr: usize }, + Strides { row_byte_stride: isize, col_byte_stride: isize, mr: usize, nr: usize }, +} + +#[derive(Clone, Copy, Debug)] +pub struct OutputStore { + pub(crate) ptr: *mut u8, + pub(crate) row_byte_stride: isize, + pub(crate) col_byte_stride: isize, + pub(crate) panel_row_byte_stride: isize, + pub(crate) panel_col_byte_stride: isize, + pub(crate) item_size: usize, + pub(crate) item_count: usize, + pub(crate) mr: usize, +} + +unsafe impl Send for OutputStore {} +unsafe impl Sync for OutputStore {} + +impl OutputStoreSpec { + #[inline] + pub unsafe fn wrap(&self, tensor: &TensorView) -> OutputStore { + let (mr, nr, row_byte_stride, col_byte_stride) = unsafe { self.compute_strides(tensor) }; + OutputStore { + ptr: unsafe { 
tensor.as_ptr_unchecked::() } as _, + row_byte_stride, + col_byte_stride, + panel_row_byte_stride: row_byte_stride * mr as isize, + panel_col_byte_stride: col_byte_stride * nr as isize, + item_size: tensor.datum_type().size_of(), + mr, + item_count: tensor.len(), + } + } + + #[inline] + unsafe fn compute_strides(&self, tensor: &TensorView) -> (usize, usize, isize, isize) { + let size_of = tensor.datum_type().size_of() as isize; + match self { + OutputStoreSpec::View { m_axis, n_axis, mr, nr, .. } => { + let tensor_strides = tensor.strides(); + let row_item_stride = + m_axis.map(|ax| *unsafe { tensor_strides.get_unchecked(ax) }).unwrap_or(0); + let col_item_stride = + n_axis.map(|ax| *unsafe { tensor_strides.get_unchecked(ax) }).unwrap_or(0); + let row_byte_stride = row_item_stride * size_of; + let col_byte_stride = col_item_stride * size_of; + (*mr, *nr, row_byte_stride, col_byte_stride) + } + OutputStoreSpec::Strides { row_byte_stride, col_byte_stride, mr, nr, .. } => { + (*mr, *nr, *row_byte_stride, *col_byte_stride) + } + } + } +} + +impl OutputStore { + #[inline] + pub(super) unsafe fn tile_c(&self, down: usize, right: usize) -> OutputStoreKer { + unsafe { + let (down, right) = (down as isize, right as isize); + OutputStoreKer { + ptr: self + .ptr + .offset(self.panel_row_byte_stride * down + self.panel_col_byte_stride * right) + as *mut _, + row_byte_stride: self.row_byte_stride, + col_byte_stride: self.col_byte_stride, + item_size: self.item_size, + } + } + } + + #[inline] + pub fn item_size(&self) -> usize { + self.item_size + } + + #[inline] + pub(super) unsafe fn set_from_tile( + &self, + down: usize, + right: usize, + height: usize, + width: usize, + tile: &OutputStoreKer, + ) { + unsafe { + if self.item_size() == 1 { + self.set_from_tile_t::(down, right, height, width, tile) + } else if self.item_size() == 2 { + self.set_from_tile_t::(down, right, height, width, tile) + } else if self.item_size() == 4 { + self.set_from_tile_t::(down, right, height, 
width, tile) + } else { + self.set_from_tile_t::(down, right, height, width, tile) + } + } + } + + #[inline] + unsafe fn set_from_tile_t( + &self, + down: usize, + right: usize, + height: usize, + width: usize, + tile: &OutputStoreKer, + ) { + unsafe { + let tile = tile.ptr as *mut T; + let dst = self.ptr.add( + self.panel_row_byte_stride as usize * down + + self.panel_col_byte_stride as usize * right, + ); + for y in 0..height as isize { + for x in 0..width as isize { + let value = tile.offset(y + x * self.mr as isize); + let dst = dst.offset(y * self.row_byte_stride + x * self.col_byte_stride); + *(dst as *mut T) = *value; + } + } + } + } +} + +#[repr(C)] +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +pub struct OutputStoreKer { + pub ptr: *mut u8, + pub row_byte_stride: isize, + pub col_byte_stride: isize, + pub item_size: usize, +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs new file mode 100644 index 000000000..384eab9e3 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs @@ -0,0 +1,295 @@ +use crate::frame::mmm::*; +use crate::{BinOp, LADatum}; +use num_traits::AsPrimitive; +use std::ops::Neg; +use tests::display_error; +use tract_data::internal::*; + +#[macro_export] +macro_rules! mmm_frame_tests { + ($ker:expr, $ta:ty, $tb:ty, $tc:ty, $ti:ty) => { + mod frame { + use tract_data::internal::*; + #[allow(unused_imports)] + use $crate::frame::mmm::tests::frame::*; + + #[test] + fn row_mul_2_1_3() -> TractResult<()> { + unsafe { row_mul::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? } + Ok(()) + } + + #[test] + fn row_add_2_1_3() -> TractResult<()> { + unsafe { row_add::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? } + Ok(()) + } + + #[test] + fn col_mul_2_1_3() -> TractResult<()> { + unsafe { col_mul::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? } + Ok(()) + } + + #[test] + fn col_add_2_1_3() -> TractResult<()> { + unsafe { col_add::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? 
} + Ok(()) + } + + #[test] + fn max_2_1_3() -> TractResult<()> { + unsafe { max::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? } + Ok(()) + } + + #[test] + fn min_2_1_3() -> TractResult<()> { + unsafe { min::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? } + Ok(()) + } + + #[test] + fn add_d_2_1_3() -> TractResult<()> { + unsafe { add_d::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? } + Ok(()) + } + + #[test] + fn add_d_big() -> TractResult<()> { + unsafe { add_d::<_, $ta, $tb, $tc, $ti>($ker, 197, 1)? } + Ok(()) + } + } + }; +} + +pub unsafe fn fused_ops< + K: MatMatMulKer + 'static, + TA, + TB, + TC, + TI, + F: Fn(usize, usize) -> TC, +>( + ker: &K, + m: usize, + n: usize, + spec: &[FusedSpec], + expect: F, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static, + i32: AsPrimitive, + usize: AsPrimitive, +{ + if !ker.is_supported_here() { + return Ok(()); + }; + crate::setup_test_logger(); + + let mut found = Tensor::zero::(&[m, n])?; + let c_store = unsafe { + ker.c_from_data_and_strides(TC::datum_type().size_of(), n as isize, 1) + .wrap(&found.view_mut()) + }; + let mut spec: TVec = spec.into(); + spec.push(FusedSpec::Store(c_store)); + + unsafe { ker.run(m, n, &spec) }?; + let expected = + tract_ndarray::prelude::Array2::from_shape_fn((m, n), |(r, c)| expect(r, c)).into_tensor(); + let err = found.close_enough(&expected, true); + if err.is_err() { + display_error(found.as_slice::()?, expected.as_slice::()?, m, n); + } + err +} + +pub unsafe fn row_add + 'static, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let bias = (0..m).map(|i| i.as_()).collect::>(); + unsafe { + fused_ops::( + ker, + m, + n, + 
&[FusedSpec::BinPerRow(tensor1(&bias).view(), BinOp::Add)], + |r, _| bias[r].as_(), + ) + } +} + +pub unsafe fn row_mul + 'static, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let bias = (0..m).map(|i| i.as_()).collect::>(); + unsafe { + fused_ops::( + ker, + m, + n, + &[ + FusedSpec::BinScalar(&tensor0(1i32.as_()), BinOp::Add), + FusedSpec::BinPerRow(tensor1(&bias).view(), BinOp::Mul), + ], + |r, _| bias[r].as_(), + ) + } +} + +pub unsafe fn col_add + 'static, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let bias = (0..n).map(|i| i.as_()).collect::>(); + unsafe { + fused_ops::( + ker, + m, + n, + &[FusedSpec::BinPerCol(tensor1(&bias).view(), BinOp::Add)], + |_, c| bias[c].as_(), + ) + } +} + +pub unsafe fn col_mul + 'static, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let bias = (0..n).map(|i| i.as_()).collect::>(); + unsafe { + fused_ops::( + ker, + m, + n, + &[ + FusedSpec::BinScalar(&tensor0(1i32.as_()), BinOp::Add), + FusedSpec::BinPerCol(tensor1(&bias).view(), BinOp::Mul), + ], + |_, c| bias[c].as_(), + ) + } +} + +pub unsafe fn add_d + 'static, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 
'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let d = (0..m * n).map(|i| i.as_()).collect::>(); + let d = tensor1(&d).into_shape(&[m, n])?; + let store_spec = + OutputStoreSpec::View { m_axis: Some(0), n_axis: Some(1), mr: ker.mr(), nr: ker.nr() }; + let view_d = d.to_array_view::()?.into_dimensionality()?; + unsafe { + fused_ops::( + ker, + m, + n, + &[FusedSpec::AddUnicast(store_spec.wrap(&d.view()))], + |r, c| view_d[(r, c)].as_(), + ) + } +} + +pub unsafe fn max, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let five: TI = 5.as_(); + unsafe { + fused_ops::( + ker, + m, + n, + &[FusedSpec::BinScalar(&tensor0(five), BinOp::Max)], + |_, _| five.as_(), + ) + } +} + +pub unsafe fn min, TA, TB, TC, TI>( + ker: &K, + m: usize, + n: usize, +) -> TractResult<()> +where + TA: LADatum + AsPrimitive + 'static, + TB: LADatum + AsPrimitive + 'static, + TC: LADatum + AsPrimitive + 'static, + TI: LADatum + AsPrimitive + 'static + Neg, + i32: AsPrimitive, + usize: AsPrimitive, +{ + let five: TI = 5.as_(); + unsafe { + fused_ops::( + ker, + m, + n, + &[FusedSpec::BinScalar(&tensor0(five), BinOp::Min)], + |_, _| TC::zero(), + ) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs new file mode 100644 index 000000000..da909a80b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs @@ -0,0 +1,287 @@ +use crate::frame::mmm::fuse::FusedKerSpec; +use crate::frame::mmm::storage::*; +use crate::frame::mmm::tests::display_error; +use crate::frame::mmm::tests::store::mmm_stride_storage; +use crate::frame::mmm::*; +use num_traits::{AsPrimitive, Bounded}; +use proptest::prelude::*; +use 
tract_data::internal::*; + +#[macro_export] +macro_rules! mmm_kernel_fuse_tests { + ($ker:expr, $tc:ty, $ti: ty) => { + mod fuse { + use num_traits::Zero; + #[allow(unused_imports)] + use tract_data::prelude::f16; + use tract_data::prelude::tensor0; + use $crate::frame::mmm::tests::fuse as test; + #[allow(unused_imports)] + use $crate::frame::mmm::tests::fuse::*; + use $crate::frame::mmm::MatMatMulKer; + + #[test] + fn return_zeros() { + test::return_zeros::<_, $tc, $ti>($ker) + } + + #[test] + fn store_non_contiguous() { + test::store_non_contiguous::<_, $tc, $ti>($ker) + } + proptest::proptest! { + #[test] + fn return_c_prop(c in tile::<_, $ti>($ker)) { + test::return_c::<_, $ti>($ker, &c) + } + } + + fn fmin(a: T, b: T) -> T { + if a < b { + a + } else { + b + } + } + + fn fmax(a: T, b: T) -> T { + if a > b { + a + } else { + b + } + } + + macro_rules! bin { + ($FKS:ident, $geo:expr, $f:expr, $extra_cond:expr) => { + paste! { + #[test] + fn [<$FKS:snake>]() { + if ($ker).is_supported_here() && $extra_cond { + test::$geo::<_, $ti>($ker, $crate::mmm::FusedKerSpec::$FKS, $f); + } + } + } + }; + } + + bin!(PerColMin, per_col, fmin, true); + bin!(PerColMax, per_col, fmax, true); + bin!(PerColAdd, per_col, |a, b| a + b, true); + bin!(PerColMul, per_col, |a, b| a * b, true); + bin!(PerColSub, per_col, |a, b| a - b, true); + bin!(PerColSubF, per_col, |a, b| b - a, true); + + bin!(PerRowMin, per_row, fmin, true); + bin!(PerRowMax, per_row, fmax, true); + bin!(PerRowAdd, per_row, |a, b| a + b, true); + bin!(PerRowMul, per_row, |a, b| a * b, true); + bin!(PerRowSub, per_row, |a, b| a - b, true); + bin!(PerRowSubF, per_row, |a, b| b - a, true); + + bin!(ScalarMin, scalar, fmin, true); + bin!(ScalarMax, scalar, fmax, true); + bin!(ScalarAdd, scalar, |a, b| a + b, true); + bin!(ScalarMul, scalar, |a, b| a * b, true); + bin!(ScalarSub, scalar, |a, b| a - b, true); + bin!(ScalarSubF, scalar, |a, b| b - a, true); + + bin!( + LeakyRelu, + scalar, + |a, b| if b > <$ti>::zero() { b 
} else { a * b }, + ($ker).can_fuse(&$crate::mmm::FusedSpec::LeakyRelu(&tensor0(<$ti>::from(1_u8)))) + ); + + #[test] + fn return_c_add_row_col_product() { + test::return_c_add_row_col_product::<_, $ti>($ker) + } + + #[test] + fn return_c_plus_d() { + test::return_c_plus_d::<_, $ti, $ti>($ker) + } + + #[test] + fn return_c_clear() { + test::return_c_clear::<_, $ti>($ker) + } + } + }; +} + +use crate::LADatum; +pub fn return_zeros(ker: &K) +where + K: MatMatMulKer, + TC: LADatum, + TI: LADatum + Bounded + PartialEq, +{ + if !ker.is_supported_here() { + return; + } + let v = vec![TC::max_value(); ker.mr() * ker.nr()]; + let c = mmm_stride_storage(&v, ker.nr()); + let non_linear = tvec![FusedKerSpec::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done]; + let err = ker.kernel(&non_linear); + assert_eq!(err, 0); + let expected = vec![TC::zero(); v.len()]; + display_error(&v, &expected, ker.mr(), ker.nr()); + assert_eq!(v, expected); +} + +pub fn store_non_contiguous(ker: &K) +where + K: MatMatMulKer, + TC: LADatum, + TI: LADatum + Bounded + PartialEq, +{ + if !ker.is_supported_here() { + return; + } + let v = vec![TC::max_value(); ker.mr() * 5 * ker.nr() * 3]; + let c = OutputStoreKer { + ptr: v.as_ptr() as _, + row_byte_stride: (std::mem::size_of::() * 3 * ker.nr() * 5) as isize, + col_byte_stride: std::mem::size_of::() as isize * 3, + item_size: std::mem::size_of::(), + }; + let non_linear = tvec![FusedKerSpec::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done]; + let err = ker.kernel(&non_linear); + assert_eq!(err, 0); + let mut expected = vec![TC::max_value(); v.len()]; + for c in 0..ker.nr() { + for r in 0..ker.mr() { + expected[c * 3 + r * 3 * 5 * ker.nr()] = TC::zero(); + } + } + assert_eq!(v, expected); +} + +pub fn fused_ops(ker: &K, c: &[TI], ops: &[FusedKerSpec], expect: E) +where + K: MatMatMulKer, + TI: LADatum, + E: Fn(usize, usize, TI) -> TI, +{ + if !ker.is_supported_here() { + return; + } + assert!(c.len() == ker.mr() * ker.nr()); + let v = c.to_vec(); + 
let c = mmm_stride_storage(&v, ker.nr()); + let mut ops = ops.to_vec(); + ops.insert(0, FusedKerSpec::AddUnicast(c)); + ops.insert(0, FusedKerSpec::Clear); + ops.push(FusedKerSpec::Store(c)); + ops.push(FusedKerSpec::Done); + let expected = + (0..v.len()).map(|ix| expect(ix / ker.nr(), ix % ker.nr(), v[ix])).collect::>(); + let err = ker.kernel(&ops); + assert_eq!(err, 0); + display_error(&v, &expected, ker.mr(), ker.nr()); + assert_eq!(v, expected); +} + +pub fn return_c(ker: &K, v: &[TI]) +where + K: MatMatMulKer, + TI: LADatum, + usize: AsPrimitive, +{ + fused_ops::(ker, v, &[], |_, _, c| c + 1.as_() - 1.as_()) +} + +pub fn return_c_plus_d(ker: &K) +where + K: MatMatMulKer, + TI: LADatum, + TD: LADatum + AsPrimitive, + usize: AsPrimitive + AsPrimitive, +{ + let len = ker.mr() * ker.nr(); + let v: Vec = (0..len).map(|f| f.as_()).collect(); + let d: Vec = (0..len).map(|f| ((3 * f) % 7).as_()).collect(); + fused_ops::( + ker, + &v, + &[FusedKerSpec::AddUnicast(mmm_stride_storage(&d, ker.nr()))], + |row, col, c| c + d[row * ker.nr() + col].as_(), + ); +} + +pub fn per_col(ker: &K, op: impl Fn(*const TI) -> FusedKerSpec, f: impl Fn(TI, TI) -> TI) +where + K: MatMatMulKer, + TI: LADatum, + usize: AsPrimitive, +{ + let len = ker.mr() * ker.nr(); + let v: Vec = (0..len).map(|f| f.as_()).collect(); + let bias: Vec = (0..ker.nr()).map(|f| (f + 1).as_()).collect(); + fused_ops::(ker, &v, &[op(bias.as_ptr())], |_, col, c| f(bias[col], c)) +} + +pub fn per_row(ker: &K, op: impl Fn(*const TI) -> FusedKerSpec, f: impl Fn(TI, TI) -> TI) +where + K: MatMatMulKer, + TI: LADatum, + usize: AsPrimitive, +{ + let len = ker.mr() * ker.nr(); + let v: Vec = (0..len).map(|f| f.as_()).collect(); + let bias: Vec = (0..ker.mr()).map(|f| (f + 1).as_()).collect(); + fused_ops::(ker, &v, &[op(bias.as_ptr())], |row, _, c| f(bias[row], c)) +} + +pub fn scalar(ker: &K, op: impl Fn(TI) -> FusedKerSpec, f: impl Fn(TI, TI) -> TI) +where + K: MatMatMulKer, + TI: LADatum, + isize: AsPrimitive, +{ + 
let len = ker.mr() * ker.nr(); + let v: Vec = (0..len as isize).map(|f| (f - len as isize / 2).as_()).collect(); + let five: TI = 5.as_(); + fused_ops::(ker, &v, &[op(five)], |_, _, c| f(five, c)) +} + +pub fn return_c_add_row_col_product(ker: &K) +where + K: MatMatMulKer, + TI: LADatum, + usize: AsPrimitive, +{ + let len = ker.mr() * ker.nr(); + let v: Vec = (0..len).map(|f| (f + 1).as_()).collect(); + let rows: Vec = (0..ker.mr()).map(|f| (f + 3).as_()).collect(); + let cols: Vec = (0..ker.nr()).map(|f| (f + 2).as_()).collect(); + fused_ops::( + ker, + &v, + &[FusedKerSpec::AddRowColProducts(rows.as_ptr(), cols.as_ptr())], + |row, col, c| c + cols[col] * rows[row], + ) +} + +pub fn return_c_clear(ker: &K) +where + K: MatMatMulKer, + TI: LADatum, + usize: AsPrimitive, +{ + let len = ker.mr() * ker.nr(); + let v: Vec = (0..len).map(|f| f.as_()).collect(); + fused_ops::(ker, &v, &[FusedKerSpec::Clear], |_, _, _| 0.as_()) +} + +pub fn tile(ker: &K) -> BoxedStrategy> +where + K: MatMatMulKer, + TI: LADatum, + i8: AsPrimitive, +{ + let len = ker.mr() * ker.nr(); + proptest::collection::vec(any::().prop_map(|c| c.as_()), len..=len).boxed() +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs new file mode 100644 index 000000000..beb4fb25d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs @@ -0,0 +1,89 @@ +use crate::LADatum; + +#[macro_use] +pub mod fuse; +#[macro_use] +pub mod frame; +#[macro_use] +pub mod packed_packed; +#[macro_use] +pub mod q_scale; +#[macro_use] +pub mod store; + +#[cfg(test)] +macro_rules! test_mmm_kernel { + (f16, $ker:expr) => { + test_mmm_kernel_f16!($ker); + }; + (f32, $ker:expr) => { + test_mmm_kernel_f32!($ker); + }; + (f64, $ker:expr) => { + test_mmm_kernel_f64!($ker); + }; + (i32, $ker:expr) => { + test_mmm_kernel_i32!($ker); + }; +} + +#[macro_export] +macro_rules! 
test_mmm_kernel_f16 { + ($ker: expr) => { + mmm_packed_packed_tests!(&*$ker, f16f16:0); + mmm_frame_tests!(&*$ker, f16, f16, f16, f16); + mmm_kernel_fuse_tests!(&*$ker, f16, f16); + mmm_store_test!(&*$ker, f16); + }; +} + +#[macro_export] +macro_rules! test_mmm_kernel_f32 { + ($ker: expr) => { + mmm_packed_packed_tests!(&*$ker, f32f32:0); + mmm_frame_tests!(&*$ker, f32, f32, f32, f32); + mmm_kernel_fuse_tests!(&*$ker, f32, f32); + mmm_store_test!(&*$ker, f32); + }; +} + +#[macro_export] +macro_rules! test_mmm_kernel_f64 { + ($ker:expr) => { + mmm_packed_packed_tests!(&*$ker, f64f64:0); + mmm_frame_tests!(&*$ker, f64, f64, f64, f64); + mmm_kernel_fuse_tests!(&*$ker, f64, f64); + mmm_store_test!(&*$ker, f64); + }; +} + +#[macro_export] +macro_rules! test_mmm_kernel_i32 { + ($ker: expr) => { + mmm_packed_packed_tests!(&*$ker, i32i32:0); + mmm_kernel_fuse_tests!(&*$ker, i32, i32); + mmm_frame_tests!(&*$ker, i32, i32, i32, i32); + mmm_q_scale_tests!(&*$ker); + mmm_store_test!(&*$ker, i32); + }; +} + +pub fn display_error(v: &[TC], expected: &[TC], m: usize, n: usize) { + if v != expected { + for ixm in 0..m { + print!("|"); + for ixn in 0..n { + use nu_ansi_term::Color::*; + let f = v[ixm * n + ixn]; + let e = expected[ixm * n + ixn]; + let color = if f != e { Red.bold() } else { Green.into() }; + print!("{}|", color.paint(format!("{f:5}"))); + } + print!(" # "); + for ixn in 0..n { + print!("{:5} ", expected[ixm * n + ixn]); + } + println!(); + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs new file mode 100644 index 000000000..028ed2337 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs @@ -0,0 +1,382 @@ +use crate::block_quant::PackedBlockQuantFormat; +use crate::mmm::tests::display_error; +use crate::mmm::{AsInputValue, FusedKerSpec, FusedSpec, MatMatMul, MatMatMulKer, OutputStoreKer}; +use crate::pack::PackedFormat; +use 
proptest::collection::vec; +use proptest::prelude::*; +use std::fmt::Debug; +use tract_data::internal::*; + +#[macro_export] +macro_rules! mmm_packed_packed_tests { + ($ker:expr, $packing_id:ident : $packing: expr) => { + mod $packing_id { + use super::*; + #[allow(unused_imports)] + use proptest::prelude::*; + #[allow(unused_imports)] + use tract_data::prelude::f16; + use tract_data::prelude::*; + use tract_itertools::Itertools; + use $crate::frame::mmm::kernel::MatMatMulKer; + #[allow(unused_imports)] + use $crate::frame::mmm::tests::packed_packed::*; + + mod fuse { + use super::*; + + proptest::proptest! { + #[test] + fn prop(pb in arbitrary_problem(false, $ker, $packing)) { + pb.check().unwrap() + } + } + + fn t(a: impl Into>, b: impl Into>) -> TractResult<()> { + PackedPackedProblem::kernel($ker, $packing, a, b).check() + } + + #[test] + fn packed_packed_1() -> TractResult<()> { + t(vec![1f32; $ker.mr()], vec![1f32; $ker.nr()]) + } + + #[test] + fn packed_packed_2() -> TractResult<()> { + t(vec![1f32; $ker.mr() * 2], vec![1f32; $ker.nr() * 2]) + } + + #[test] + fn packed_packed_13() -> TractResult<()> { + t(vec![1f32; $ker.mr() * 13], vec![1f32; $ker.nr() * 13]) + } + + #[test] + fn packed_packed_a_scale() -> TractResult<()> { + t((1..=$ker.mr() as i64).map(|x| x as f32).collect_vec(), vec![1f32; $ker.nr()]) + } + + #[test] + fn packed_packed_a_scale_times_2() -> TractResult<()> { + t( + (1..=2 * $ker.mr() as i64).map(|x| x as f32).collect_vec(), + vec![1f32; $ker.nr() * 2], + ) + } + + #[test] + fn packed_packed_empty() -> TractResult<()> { + t(vec![0f32; 0], vec![0f32; 0]) + } + + #[test] + fn packed_packed_bug_1() -> TractResult<()> { + t(vec![0f32; $ker.mr()], vec![0f32; $ker.nr()]) + } + + #[test] + fn packed_packed_bug_2() -> TractResult<()> { + let mut a = vec![0f32; $ker.mr()]; + a[0] = 1.; + let mut b = vec![0f32; $ker.nr()]; + b[0] = 1.; + t(a, b) + } + + #[test] + fn packed_packed_bug_3() -> TractResult<()> { + if $ker.mr() >= 4 { + let mut a = 
vec![0f32; 2 * $ker.mr()]; + let mut b = vec![0f32; 2 * $ker.nr()]; + a[2] = -0.7548828f32; + a[3] = 0.23547363f32; + b[2 * $ker.nr() - 1] = 0.93603516; + t(a, b)?; + } + Ok(()) + } + + #[test] + fn packed_packed_bug_4() -> TractResult<()> { + if $ker.mr() > 16 { + let mut a = vec![0f32; $ker.mr()]; + let mut b = vec![0f32; $ker.nr()]; + a[16] = 1.; + b[0] = 1.; + t(a, b)?; + } + Ok(()) + } + } + + mod frame { + use super::*; + + proptest::proptest! { + #[test] + fn prop(pb in arbitrary_problem(true, $ker, $packing)) { + pb.check().unwrap() + } + } + + fn t( + m: usize, + n: usize, + a: impl Into>, + b: impl Into>, + ) -> TractResult<()> { + PackedPackedProblem::frame($ker, $packing, m, n, a, b).check() + } + + fn ti( + m: usize, + n: usize, + a: impl Into>, + b: impl Into>, + ) -> TractResult<()> { + let a = a.into().into_iter().map(|i| i as f32).collect_vec(); + let b = b.into().into_iter().map(|i| i as f32).collect_vec(); + t(m, n, a, b) + } + + #[test] + fn trivial_1x2() -> TractResult<()> { + ti(1, 2, [0], [0, 0]) + } + + #[test] + fn packed_packed_empty() -> TractResult<()> { + t($ker.mr(), $ker.nr(), [], []) + } + + #[test] + fn packed_packed_empty_2() -> TractResult<()> { + t(2 * $ker.mr(), 2 * $ker.nr(), [], []) + } + + #[test] + fn mat_mul_1() -> TractResult<()> { + ti(3, 2, [-3, 3, 5, -5, 6, 0, -6, -5, 0, 0, 9, 7], [-8, 5, 5, -3, 5, 7, -8, -1]) + } + + #[test] + fn mat_mul_2() -> TractResult<()> { + ti(1, 3, [122, 82], [0, 0, 37, 0, 0, 57]) + } + } + } + }; +} + +#[derive(Debug, new)] +pub struct PackedPackedProblem +where + K: MatMatMulKer, +{ + pub frame_test: Option<(usize, usize)>, + pub ker: K, + pub packing: usize, + pub a: Vec, + pub b: Vec, +} + +pub fn arbitrary_problem( + frame_test: bool, + ker: &K, + packing: usize, +) -> BoxedStrategy> { + let (mr, nr) = (ker.mr(), ker.nr()); + let item_range = if ker.internal_type().is_integer() { (-5f32)..5f32 } else { (-1f32)..1f32 }; + let (m_range, n_range) = + if frame_test { (1usize..3 * mr, 1usize..3 
* nr) } else { (mr..mr + 1, nr..nr + 1) }; + let ker = ker.clone(); + (m_range, 0usize..40, n_range) + .prop_flat_map(move |(m, k, n)| { + ( + vec(item_range.clone(), k * m..=k * m), + vec(item_range.clone(), k * n..=k * n), + Just((m, n)), + ) + }) + .prop_map(move |(mut a, mut b, mn)| { + a.reverse(); + b.reverse(); + PackedPackedProblem { + frame_test: Some(mn).filter(|_| frame_test), + ker: ker.clone(), + packing, + a, + b, + } + }) + .boxed() +} + +impl PackedPackedProblem { + pub fn kernel( + ker: &K, + packing: usize, + a: impl Into>, + b: impl Into>, + ) -> PackedPackedProblem { + PackedPackedProblem { + frame_test: None, + ker: ker.clone(), + packing, + a: a.into(), + b: b.into(), + } + } + + pub fn frame( + ker: &K, + packing: usize, + m: usize, + n: usize, + a: impl Into>, + b: impl Into>, + ) -> PackedPackedProblem { + PackedPackedProblem { + frame_test: Some((m, n)), + ker: ker.clone(), + packing, + a: a.into(), + b: b.into(), + } + } + + pub fn mkn(&self) -> (usize, usize, usize) { + let (m, n) = self.frame_test.unwrap_or((self.ker.mr(), self.ker.nr())); + assert!(m != 0 && n != 0); + let k = self.a.len() / m; + assert_eq!(self.b.len() / n, k); + (m, k, n) + } + + pub fn padded_inputs(&self) -> TractResult<(Tensor, Tensor)> { + let (pack_a, pack_b) = &self.ker.packings()[self.packing]; + assert!(pack_b.k_alignment() == 1); + let (m, k, n) = self.mkn(); + let k_aligned = k.next_multiple_of(pack_a.k_alignment()); + + let mut a = Tensor::zero::(&[m, k_aligned])?; + for row in 0..m { + for col in 0..k { + a.to_array_view_mut()?[[row, col]] = self.a[col + k * row]; + } + } + if let Some(pf) = pack_a.downcast_ref::() { + a = a.cast_to_dt(pf.dt)?.into_owned(); + } + let mut b = Tensor::zero::(&[k_aligned, n])?; + for row in 0..k { + for col in 0..n { + b.to_array_view_mut()?[[row, col]] = self.b[col + n * row]; + } + } + if let Some(pf) = pack_b.downcast_ref::() { + b = b.cast_to_dt(pf.dt)?.into_owned(); + } + + Ok((a, b)) + } + + pub fn reference(&self) -> 
TractResult { + let (m, k, n) = self.mkn(); + let pack_a = &self.ker.packings()[self.packing].0; + let (mut a, b) = self.padded_inputs()?; + let k_aligned = k.next_multiple_of(pack_a.k_alignment()); + if let Some(pbqf) = pack_a.downcast_ref::() { + a = pbqf.simulate_precision_loss(a, 1)?; + }; + let mut c = Tensor::zero::(&[m, n])?; + + let a = a.cast_to::()?; + let a = a.as_slice::()?; + let b = b.cast_to::()?; + let b = b.as_slice::()?; + let mut view = c.to_array_view_mut::()?.into_dimensionality()?; + for ix_m in 0..m { + for ix_n in 0..n { + for ix_k in 0..k { + let a = a[ix_k + k_aligned * ix_m]; + let b = b[ix_n + n * ix_k]; + view[(ix_m, ix_n)] += a * b; + } + } + } + Ok(c) + } + + pub fn run(&self) -> TractResult { + let (m, k, n) = self.mkn(); + let (pack_a, pack_b) = &self.ker.packings()[self.packing]; + assert!(pack_b.k_alignment() == 1); + let k_aligned = k.next_multiple_of(pack_a.k_alignment()); + + let (a, b) = self.padded_inputs()?; + let pa = pack_a.prepare_one(&a, 1, 0)?; + let pb = pack_b.prepare_one(&b, 0, 1)?; + + let mut v = unsafe { Tensor::uninitialized_dt(self.ker.internal_type(), &[m, n])? 
}; + let item_size = self.ker.internal_type().size_of(); + + if self.frame_test.is_some() { + unsafe { + let c = self.ker.c_view(Some(0), Some(1)).wrap(&v.view_mut()); + let ops = tvec!( + FusedSpec::AddMatMul { + a: AsInputValue::Borrowed(&*pa), + b: AsInputValue::Borrowed(&*pb), + packing: self.packing + }, + FusedSpec::Store(c) + ); + self.ker.run(m, n, &ops)?; + } + } else { + let c = OutputStoreKer { + ptr: v.as_bytes_mut().as_mut_ptr(), + row_byte_stride: (item_size * self.ker.nr()) as isize, + col_byte_stride: item_size as isize, + item_size, + }; + + let non_linear_ops = tvec!( + FusedKerSpec::Clear, + FusedKerSpec::AddMatMul { + k: k_aligned, + pa: pa.panel_bytes(0, None)?, + pb: pb.panel_bytes(0, None)?, + packing: self.packing + }, + FusedKerSpec::Store(c), + FusedKerSpec::Done + ); + let err = self.ker.kernel(&non_linear_ops); + assert_eq!(err, 0); + } + Ok(v) + } + + pub fn check(&self) -> TractResult<()> { + if !self.ker.is_supported_here() { + return Ok(()); + } + let expected = self.reference()?; + let found = self.run()?; + let app = if K::Acc::datum_type() == f16::datum_type() { + Approximation::SuperApproximate + } else { + Approximation::Approximate + }; + let result = found.close_enough(&expected, app); + if result.is_err() { + let exp = expected.as_slice::()?; + let found = found.as_slice::()?; + let (m, _, n) = self.mkn(); + display_error(found, exp, m, n); + } + result + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs new file mode 100644 index 000000000..7d7678e05 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs @@ -0,0 +1,176 @@ +use crate::frame::mmm::fuse::RoundingPolicy; +use crate::frame::mmm::MatMatMulKer; +use crate::generic::rounding::ScaleShiftAndRound; +use crate::mmm::{FusedKerSpec, FusedSpec}; +use crate::Scaler; +use proptest::prelude::*; + +use super::fuse::fused_ops; + +#[derive(Debug, new)] +pub struct 
QScaleProblem +where + K: MatMatMulKer, +{ + pub ker: K, + pub c: Vec, + pub scaler: Scaler, + pub boo: std::marker::PhantomData, +} + +pub fn arbitrary_qscale_problem>( + ker: &K, +) -> BoxedStrategy> { + use RoundingPolicy::*; + let ker = ker.clone(); + let len = ker.mr() * ker.nr(); + ( + proptest::collection::vec(-20i32..20, len..=len), + -5i32..5, + prop_oneof!(Just(1f32), 0f32..1f32), + proptest::prop_oneof![ + Just(Zero), + Just(Away), + Just(PlusInf), + Just(MinusInf), + Just(Odd), + Just(Even) + ], + ) + .prop_map(move |(c, scale_pot, scale_mult, policy)| QScaleProblem { + ker: ker.clone(), + c, + scaler: Scaler::new(scale_mult * 2f32.powi(scale_pot), policy), + boo: std::marker::PhantomData, + }) + .boxed() +} + +impl QScaleProblem +where + K: MatMatMulKer, +{ + pub fn run(&self) { + if !self.ker.is_supported_here() { + return; + } + if let FusedSpec::QScale(shift, policy, mult) = self.scaler.as_fused_spec() { + fused_ops::( + &self.ker, + &self.c, + &[FusedKerSpec::QScale(shift, policy, mult)], + |_, _, c| c.q_scale(self.scaler), + ) + } else if let FusedSpec::RoundingShiftRight(shift, policy) = self.scaler.as_fused_spec() { + fused_ops::( + &self.ker, + &self.c, + &[FusedKerSpec::RoundingShiftRight(shift, policy)], + |_, _, c| c.q_shr(shift, policy), + ) + } else if let FusedSpec::ShiftLeft(shift) = self.scaler.as_fused_spec() { + fused_ops::( + &self.ker, + &self.c, + &[FusedKerSpec::ShiftLeft(shift)], + |_, _, c| c.q_shl(shift), + ) + } else { + unreachable!() + } + } +} + +pub fn return_c_scale_bigpot(ker: &K) +where + K: MatMatMulKer, +{ + let ker = ker.clone(); + let len = ker.mr() * ker.nr(); + let v: Vec = (-(len as i32) / 2..).take(len).collect(); + fused_ops::(&ker, &v, &[FusedKerSpec::ShiftLeft(1)], |_, _, c| c.q_shl(1)) +} + +#[macro_export] +macro_rules! 
mmm_q_scale_tests { + ($ker:expr) => { + use $crate::frame::mmm::fuse::RoundingPolicy; + use $crate::frame::mmm::tests::q_scale::arbitrary_qscale_problem; + use $crate::frame::mmm::tests::q_scale::QScaleProblem; + use $crate::frame::mmm::MatMatMulKer; + use $crate::generic::Scaler; + // FIXME: Scaler should be arbitrary + macro_rules! test_q_scale { + ($policy: ident) => { + paste! { + #[test] + fn []() { + let ker = $ker; + let len = (ker.mr() * ker.nr()) as i64; + let v = (0..len).map(|i| (i - len / 2) as i32).collect(); + QScaleProblem::new(ker.clone(), v, Scaler::new(0.5f32, RoundingPolicy::$policy)).run() + } + + #[test] + fn []() { + let ker = $ker; + let len = (ker.mr() * ker.nr()) as i64; + let v = (0..len).map(|i| (i - len / 2) as i32).collect(); + QScaleProblem::new(ker.clone(), v, Scaler::new(-0.5f32, RoundingPolicy::$policy)).run() + } + + #[test] + fn []() { + let ker = $ker; + let len = (ker.mr() * ker.nr()) as i64; + let v = (0..len).map(|i| (i - len / 2) as i32).collect(); + QScaleProblem::new(ker.clone(), v, Scaler::new(0.25f32, RoundingPolicy::$policy)).run() + } + + #[test] + fn []() { + let ker = $ker; + let len = (ker.mr() * ker.nr()) as i64; + let v = (0..len).map(|i| (i - len / 2) as i32).collect(); + QScaleProblem::new(ker.clone(), v, Scaler::new(1f32 / 5., RoundingPolicy::$policy)).run() + } + + #[test] + fn []() { + let ker = $ker; + let len = (ker.mr() * ker.nr()) as i64; + let v = (0..len).map(|i| (i - len / 2) as i32).collect(); + QScaleProblem::new(ker.clone(), v, Scaler::new(4f32, RoundingPolicy::$policy)).run() + } + + #[test] + fn []() { + let ker = $ker; + let len = (ker.mr() * ker.nr()) as i64; + let v = (0..len).map(|i| (i - len / 2) as i32).collect(); + QScaleProblem::new(ker.clone(), v, Scaler::new(14., RoundingPolicy::$policy)).run() + } + } + } + } + + test_q_scale!(Zero); + test_q_scale!(Away); + test_q_scale!(MinusInf); + test_q_scale!(PlusInf); + test_q_scale!(Even); + test_q_scale!(Odd); + + proptest::proptest! 
{ + #[test] + fn return_q_scale_prop(pb in arbitrary_qscale_problem($ker)) { + pb.run() + } + } + + #[test] + fn return_c_scale_bigpot() { + $crate::frame::mmm::tests::q_scale::return_c_scale_bigpot::<_>($ker) + } + }; +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs new file mode 100644 index 000000000..a44d533cd --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs @@ -0,0 +1,131 @@ +use crate::frame::mmm::fuse::FusedKerSpec; +use crate::frame::mmm::storage::*; +use crate::frame::mmm::tests::display_error; +use crate::frame::mmm::*; +use crate::LADatum; +use num_traits::Bounded; +use tract_data::internal::*; +use tract_itertools::Itertools; +use tract_ndarray::Axis; + +#[macro_export] +macro_rules! mmm_store_test { + ($ker:expr, $tc:ident) => { + paste! { + mod [] { + #[allow(unused_imports)] + use tract_data::prelude::f16; + use $crate::frame::mmm::tests::store::StoreLayout; + + #[test] fn store_zeros() { + $crate::frame::mmm::tests::store::store_zeros::<_,$tc,_>($ker); + } + + #[test] fn store_col_major() { + $crate::frame::mmm::tests::store::store_pattern::<_,$tc,_>($ker, StoreLayout::ColMajor); + } + + #[test] fn store_row_major() { + $crate::frame::mmm::tests::store::store_pattern::<_,$tc,_>($ker, StoreLayout::RowMajor); + } + + #[test] fn store_arbitrary() { + $crate::frame::mmm::tests::store::store_pattern::<_,$tc,_>($ker, StoreLayout::Arbitrary); + } + } + } + }; +} + +pub fn mmm_stride_storage(v: &[T], rsc: usize) -> OutputStoreKer { + OutputStoreKer { + ptr: v.as_ptr() as _, + row_byte_stride: (std::mem::size_of::() * rsc) as isize, + col_byte_stride: std::mem::size_of::() as isize, + item_size: std::mem::size_of::(), + } +} + +pub fn store_zeros(ker: &K) +where + K: MatMatMulKer, + TC: LADatum, + TI: LADatum + Bounded + PartialEq, +{ + if !ker.is_supported_here() { + return; + } + let v = vec![TC::max_value(); ker.mr() * ker.nr()]; + let c = 
mmm_stride_storage(&v, ker.nr()); + let non_linear = tvec![FusedKerSpec::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done]; + let err = ker.kernel(&non_linear); + assert_eq!(err, 0); + let expected = vec![TC::zero(); v.len()]; + display_error(&v, &expected, ker.mr(), ker.nr()); + assert_eq!(v, expected); +} + +pub enum StoreLayout { + ColMajor, + RowMajor, + Arbitrary, +} + +pub fn store_pattern(ker: &K, layout: StoreLayout) +where + K: MatMatMulKer, + TC: LADatum, + TI: LADatum + Bounded + PartialEq, +{ + if !ker.is_supported_here() { + return; + } + let (mr, nr) = (ker.mr(), ker.nr()); + let pattern = tensor1(&(0..).take(mr * nr).collect_vec()) + .cast_to::() + .unwrap() + .into_owned() + .into_shape(&[mr, nr]) + .unwrap(); + let pattern_col_major = pattern.clone().permute_axes(&[1, 0]).unwrap(); + let size_of_tc = std::mem::size_of::(); + let (row_stride, col_stride, result_size) = match layout { + StoreLayout::RowMajor => (nr, 1, mr * nr), + StoreLayout::ColMajor => (1, mr, mr * nr), + // like row major, but storing every other third column + StoreLayout::Arbitrary => (nr * 3, 3, mr * nr * 3), + }; + let mut result = tensor0(TC::max_value()).broadcast_to_shape(&[result_size]).unwrap(); + let non_linear = tvec![ + unsafe { + FusedKerSpec::LoadTile(pattern_col_major.as_ptr_unchecked(), pattern.as_ptr_unchecked()) + }, + FusedKerSpec::Store(OutputStoreKer { + ptr: result.as_bytes_mut().as_mut_ptr(), + row_byte_stride: (size_of_tc * row_stride) as isize, + col_byte_stride: (size_of_tc * col_stride) as isize, + item_size: size_of_tc, + }), + FusedKerSpec::Done + ]; + let err = ker.kernel(&non_linear); + assert_eq!(err, 0); + let expected = pattern.cast_to::().unwrap().into_owned(); + let result = match layout { + StoreLayout::RowMajor => result, + StoreLayout::ColMajor => { + result.into_shape(&[ker.nr(), ker.mr()]).unwrap().permute_axes(&[1, 0]).unwrap() + } + StoreLayout::Arbitrary => result + .into_array::() + .unwrap() + .into_shape_with_order((mr, nr, 3)) + 
.unwrap() + .index_axis_move(Axis(2), 0) + .into_tensor(), + }; + let expected = expected.as_slice::().unwrap(); + let result = result.as_slice::().unwrap(); + display_error(result, expected, ker.mr(), ker.nr()); + assert_eq!(result, expected); +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/mod.rs new file mode 100644 index 000000000..8528eea35 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/mod.rs @@ -0,0 +1,25 @@ +#[macro_use] +pub mod block_quant; +#[macro_use] +pub mod element_wise; +pub mod element_wise_helper; +#[macro_use] +pub mod unicast; +#[macro_use] +pub mod by_scalar; +#[macro_use] +pub mod leaky_relu; +#[macro_use] +pub mod lut; +#[macro_use] +pub mod mmm; +#[macro_use] +pub mod pack; +#[macro_use] +pub mod reduce; +#[macro_use] +pub mod sigmoid; +#[macro_use] +pub mod tanh; +#[macro_use] +pub mod weights; diff --git a/vendor/tract-linalg-0.22.1/src/frame/pack.rs b/vendor/tract-linalg-0.22.1/src/frame/pack.rs new file mode 100644 index 000000000..ffd65f3d4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/pack.rs @@ -0,0 +1,1015 @@ +use std::alloc::Layout; +use std::fmt::{Debug, Display}; +use std::marker::PhantomData; +use std::ops::Range; +use std::sync::Arc; +use tract_data::internal::*; + +use crate::mmm::{EagerPackedInput, MMMInputFormat, MMMInputValue, PackedOpaqueFact}; + +use crate::WeightType; + +#[derive(Clone, Eq, PartialEq, Hash)] +pub struct PackedFormat { + pub dt: DatumType, + pub r: usize, + pub alignment_bytes: usize, + pub end_padding_record: usize, +} + +impl MMMInputFormat for PackedFormat { + fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult { + let packed = PackedFormat::pack_tensor(self, t, k_axis, mn_axis)?; + Ok(tensor0(Opaque(Arc::new(packed)))) + } + + fn prepare_one( + &self, + t: &Tensor, + k_axis: usize, + mn_axis: usize, + ) -> TractResult> { + PackedFormat::pack_tensor(self, t, k_axis, mn_axis) + } + + fn 
precursor(&self) -> WeightType { + WeightType::Plain(self.dt) + } + + fn r(&self) -> usize { + self.r + } + + fn k_alignment(&self) -> usize { + 1 + } + + fn same_as(&self, other: &dyn MMMInputFormat) -> bool { + other.downcast_ref::().is_some_and(|other| self == other) + } + + #[allow(clippy::collapsible_if)] + fn merge_with<'o, 'a: 'o, 'b: 'o>( + &'a self, + other: &'b dyn MMMInputFormat, + ) -> Option<&'o dyn MMMInputFormat> { + if let Some(other) = other.downcast_ref::() { + if self.r == other.r && self.dt == other.dt { + if self.alignment_bytes % other.alignment_bytes == 0 + && self.end_padding_record >= other.end_padding_record + { + return Some(self); + } + if other.alignment_bytes % self.alignment_bytes == 0 + && other.end_padding_record >= self.end_padding_record + { + return Some(other); + } + } + } + None + } + + fn mem_size(&self, k: TDim, mn: TDim) -> TDim { + self.len(k, mn) * self.dt.size_of() + } + + fn extract_at_mn_f16( + &self, + data: &EagerPackedInput, + mn: usize, + slice: &mut [f16], + ) -> TractResult<()> { + ensure!(data.format().same_as(self)); + ensure!(self.len(data.k(), data.mn()) * self.dt.size_of() == data.packed.len()); + unsafe { + let ptr = data.packed.as_ptr().add( + (self.single_panel_len(data.k()) * (mn / self.r) + mn % self.r) * self.dt.size_of(), + ); + for (i, slot) in slice.iter_mut().enumerate() { + let ptr = ptr.add(i * self.dt.size_of() * self.r); + *slot = if self.dt == f16::datum_type() { + *(ptr as *const f16) + } else if self.dt == f32::datum_type() { + f16::from_f32(*(ptr as *const f32)) + } else { + bail!("Unexpected DT {:?}", self.dt) + } + } + } + Ok(()) + } + + fn extract_at_mn_f32( + &self, + data: &EagerPackedInput, + mn: usize, + slice: &mut [f32], + ) -> TractResult<()> { + ensure!(data.format().same_as(self)); + ensure!(self.len(data.k(), data.mn()) * self.dt.size_of() == data.packed.len()); + unsafe { + let ptr = data.packed.as_ptr().add( + (self.single_panel_len(data.k()) * (mn / self.r) + mn % self.r) * 
self.dt.size_of(), + ); + for (i, slot) in slice.iter_mut().enumerate() { + let ptr = ptr.add(i * self.dt.size_of() * self.r); + *slot = if self.dt == f16::datum_type() { + (*(ptr as *const f16)).to_f32() + } else if self.dt == f32::datum_type() { + *(ptr as *const f32) + } else { + bail!("Unexpected DT {:?}", self.dt) + } + } + } + Ok(()) + } +} + +impl Display for PackedFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Packed{:?}[{}]", self.dt, self.r) + } +} + +impl Debug for PackedFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Packed{:?}[{}]@{}+{}", + self.dt, self.r, self.alignment_bytes, self.end_padding_record + ) + } +} + +impl PackedFormat { + pub const fn new(dt: DatumType, nr: usize, alignment_bytes: usize) -> PackedFormat { + PackedFormat { dt, r: nr, alignment_bytes, end_padding_record: 1 } + } + + pub const fn with_end_padding_record(self, end_padding_record: usize) -> Self { + PackedFormat { end_padding_record, ..self } + } + + #[inline] + pub fn align(self, alignment: usize) -> Self { + Self { alignment_bytes: alignment, ..self } + } + + #[inline] + pub fn alignment(&self) -> usize { + self.alignment_bytes + } + + #[inline] + pub fn panel_width(&self) -> usize { + self.r + } + + #[inline] + pub fn len(&self, k: D, n: D) -> D { + n.divceil(self.r) * self.single_panel_len(k) + } + + #[inline] + pub fn single_panel_len(&self, k: D) -> D { + ((k + self.end_padding_record) * self.r).divceil(self.alignment()) * self.alignment() + } + + #[inline] + pub fn single_panel_layout(&self, k: usize, item_size: usize) -> Layout { + Layout::from_size_align(self.single_panel_len(k) * item_size, self.alignment()).unwrap() + } + + pub fn pack_tensor( + &self, + t: &Tensor, + k_axis: usize, + mn_axis: usize, + ) -> TractResult> { + ensure!(t.datum_type().is_copy()); + ensure!( + t.datum_type().unquantized() == self.dt.unquantized(), + "Attempting to pack for {self} tensor 
{t:?}" + ); + let k = t.shape()[k_axis]; + let mn = t.shape()[mn_axis]; + let packed_len = self.len(k, mn); + let panel_len = self.single_panel_len(k); + let panel_bytes = panel_len * t.datum_type().size_of(); + let strides = t.strides(); + unsafe { + let mut packed = Blob::new_for_size_and_align( + t.datum_type().size_of() * packed_len, + self.alignment_bytes, + ); + if cfg!(debug_assertions) { + packed.as_bytes_mut().fill(0u8); + } + dispatch_copy!(Self::pack_t(t.datum_type())( + self, + packed.as_mut_ptr() as _, + t.as_ptr_unchecked(), + mn, + strides[k_axis], + strides[mn_axis], + 0..k, + 0..mn + )); + Ok(Box::new(EagerPackedInput { + fact: PackedOpaqueFact { format: Box::new(self.clone()), mn: mn.to_dim(), k }, + packed: packed.into(), + panel_bytes, + mn, + })) + } + } + + pub fn pack_tensor_view( + &self, + t: &TensorView, + k_axis: usize, + mn_axis: usize, + ) -> TractResult> { + ensure!( + t.datum_type().unquantized() == self.dt.unquantized(), + "Attempting to pack for {self} tensor view {t:?}" + ); + let k = t.shape()[k_axis]; + let mn = t.shape()[mn_axis]; + let packed_len = self.len(k, mn); + let panel_len = self.single_panel_len(k); + let panel_bytes = panel_len * t.datum_type().size_of(); + let strides = t.strides(); + unsafe { + let mut packed = Blob::new_for_size_and_align( + t.datum_type().size_of() * packed_len, + self.alignment_bytes, + ); + if cfg!(debug_assertions) { + packed.as_bytes_mut().fill(0u8); + } + dispatch_copy!(Self::pack_t(t.datum_type())( + self, + packed.as_mut_ptr() as _, + t.as_ptr_unchecked(), + mn, + strides[k_axis], + strides[mn_axis], + 0..k, + 0..mn + )); + Ok(Box::new(EagerPackedInput { + fact: PackedOpaqueFact { format: Box::new(self.clone()), mn: mn.to_dim(), k }, + packed: packed.into(), + panel_bytes, + mn, + })) + } + } + + pub unsafe fn pack<'a, 'b>( + &self, + pb: impl std::borrow::BorrowMut>, + b: impl std::borrow::Borrow>, + k_axis: usize, + mn_axis: usize, + ) { + let k = b.borrow().shape()[k_axis]; + let mn = 
b.borrow().shape()[mn_axis]; + unsafe { self.pack_segment(pb, b, k_axis, mn_axis, 0..k, 0..mn) }; + } + + + #[allow(clippy::too_many_arguments)] + #[rustfmt::skip] + pub unsafe fn pack_t( + &self, + pb: *mut T, + b: *const T, + mn: usize, + k_stride: isize, + mn_stride: isize, + k_range: Range, + mn_range: Range, + ) { unsafe { + if k_range.len() == 0 || mn_range.len() == 0 { + return + } + if self.r == 1 && k_stride == 1 && mn == 1 { + pb.copy_from_nonoverlapping(b.add(k_range.start), k_range.len()) + } else if mn_stride == 1 { + let size_of = T::datum_type().size_of(); + let rbytes = self.r * size_of; + let mn_valid_end = mn_range.end.min(mn); + let mn_range_bytes = mn_range.start * size_of..mn_valid_end * size_of; + let k_stride_bytes = k_stride * size_of as isize; + let bb = b as *const u8; + let pbb = pb as *mut u8; + let panel_len = self.single_panel_len(k_range.len()) * size_of; + match rbytes { + 16 => pack_mn_major::<[u8; 16]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range), + 24 => pack_mn_major::<[u8; 24]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range), + 32 => pack_mn_major::<[u8; 32]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range), + 48 => pack_mn_major::<[u8; 48]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range), + 64 => pack_mn_major::<[u8; 64]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range), + _ => { + let mut packer = self.write_with_k_outer(pb, k_range.len(), mn_range.len()); + for k in k_range { + for x in mn_range.start..mn_valid_end { + packer.write(*b.offset(x as isize + k_stride * k as isize)) + } + for _x in mn_valid_end..mn_range.end { + packer.write(T::default()) + } + } + } + } + } else if k_stride == 1 { + let mut packer = self.write_with_k_inner(pb, k_range.len(), mn); + let mn_valid_end = mn_range.end.min(mn); + for x in mn_range.start..mn_valid_end { + for k in k_range.clone() { + packer.write(*b.offset(x as isize * mn_stride + k as isize)) + } + } + // just ignore 
invalid mn_range + } else { + let mut packer = self.write_with_k_outer(pb, k_range.len(), mn); + let mn_valid_end = mn_range.end.min(mn); + for k in k_range { + for x in mn_range.start..mn_valid_end { + packer.write(*b.offset(x as isize * mn_stride + k_stride * k as isize)) + } + for _x in mn_valid_end..mn_range.end { + packer.write(T::default()) + } + } + } + }} + + #[inline] + pub unsafe fn pack_segment<'a, 'b>( + &self, + mut pb: impl std::borrow::BorrowMut>, + b: impl std::borrow::Borrow>, + k_axis: usize, + mn_axis: usize, + k_range: Range, + mn_range: Range, + ) { + debug_assert!(pb.borrow().len() >= self.len(k_range.len(), mn_range.len())); + let pb = pb.borrow_mut(); + let b = b.borrow(); + let dt = pb.datum_type(); + unsafe { + dispatch_copy!(Self::pack_t(dt)( + self, + pb.as_ptr_mut_unchecked(), + b.as_ptr_unchecked(), + b.shape()[mn_axis], + b.strides()[k_axis], + b.strides()[mn_axis], + k_range, + mn_range + )); + } + } + + pub fn write_with_k_outer<'p, T: Copy + Debug>( + &self, + pb: *mut T, + k: usize, + mn: usize, + ) -> KOutWriter<'p, T> { + KOutWriter::new(pb, self.r, self.single_panel_len(k), mn, k) + } + + pub fn write_single_panel_with_k_outer<'p, T: Copy + Debug>( + &self, + pb: *mut T, + ) -> KOutSinglePanelWriter<'p, T> { + KOutSinglePanelWriter::new(pb) + } + + pub fn write_with_k_inner<'p, T: Copy + Debug>( + &self, + pb: *mut T, + k: usize, + mn: usize, + ) -> KInWriter<'p, T> { + let panel_len = self.single_panel_len(k); + KInWriter::new(pb, panel_len, self.r, mn, k) + } +} + +pub trait PackingWriter { + fn write(&mut self, t: T); +} + +#[derive(Debug)] +pub struct KOutSinglePanelWriter<'p, T> +where + T: Copy + std::fmt::Debug, +{ + ptr: *mut T, + _phantom: PhantomData<&'p T>, +} + +impl<'p, T> KOutSinglePanelWriter<'p, T> +where + T: Copy + std::fmt::Debug, +{ + pub fn new(ptr: *mut T) -> KOutSinglePanelWriter<'p, T> { + KOutSinglePanelWriter { ptr, _phantom: PhantomData } + } +} + +impl PackingWriter for KOutSinglePanelWriter<'_, T> 
+where + T: Copy + std::fmt::Debug, +{ + #[inline(always)] + fn write(&mut self, t: T) { + unsafe { + *self.ptr = t; + self.ptr = self.ptr.offset(1); + } + } +} + +#[derive(Debug)] +pub struct KOutWriter<'p, T> +where + T: Copy + std::fmt::Debug, +{ + ptr: *mut T, + panels: usize, + panel_width: usize, + last_panel_width: usize, + remain: usize, + current_panel: usize, + next_panel: isize, + next_lane: isize, + _phantom: PhantomData<&'p T>, +} + +impl<'p, T> KOutWriter<'p, T> +where + T: Copy + std::fmt::Debug, +{ + pub fn new( + ptr: *mut T, + panel_width: usize, + panel_len: usize, + mn: usize, + _k: usize, + ) -> KOutWriter<'p, T> { + let panels = mn.divceil(panel_width); + let last_panel_width = mn - (panels - 1) * panel_width; + KOutWriter { + ptr, + panels, + panel_width, + last_panel_width, + remain: if panels > 1 { panel_width } else { last_panel_width }, + current_panel: 0, + next_panel: (panel_len - panel_width) as isize, + next_lane: (panel_width - last_panel_width) as isize + - (panel_len * (panels - 1)) as isize, + _phantom: PhantomData, + } + } +} + +impl PackingWriter for KOutWriter<'_, T> +where + T: Copy + std::fmt::Debug, +{ + #[inline(always)] + fn write(&mut self, t: T) { + unsafe { + *self.ptr = t; + self.remain -= 1; + self.ptr = self.ptr.offset(1); + if self.remain == 0 { + self.current_panel += 1; + if self.current_panel == self.panels { + self.ptr = self.ptr.offset(self.next_lane); + self.current_panel = 0; + } else { + self.ptr = self.ptr.offset(self.next_panel); + } + if self.current_panel == self.panels - 1 { + self.remain = self.last_panel_width; + } else { + self.remain = self.panel_width; + } + } + } + } +} + +#[derive(Debug)] +pub struct KInWriter<'p, T> +where + T: Copy + Debug, +{ + ptr: *mut T, + k: usize, + panels: usize, + panel_width: usize, + last_panel_width: usize, + remain_on_k: usize, + remain_on_mn: usize, + current_panel: usize, + next_mn_offset: isize, + next_panel_offset: isize, + _phantom: PhantomData<&'p T>, +} + 
+impl<'p, T> KInWriter<'p, T> +where + T: Copy + Debug, +{ + pub fn new( + ptr: *mut T, + panel_len: usize, + panel_width: usize, + mn: usize, + k: usize, + ) -> KInWriter<'p, T> { + let panels = mn.divceil(panel_width); + let last_panel_width = mn - (panels - 1) * panel_width; + KInWriter { + ptr, + k, + panels, + panel_width, + last_panel_width, + remain_on_k: k, + remain_on_mn: if panels == 1 { last_panel_width } else { panel_width }, + current_panel: 0, + next_mn_offset: 1 - (k * panel_width) as isize, + next_panel_offset: panel_len as isize - (k * panel_width + panel_width - 1) as isize, + // ^ next panel ^ ^ rewind left ^ ^ rewind up ^ + _phantom: PhantomData, + } + } +} + +impl PackingWriter for KInWriter<'_, T> +where + T: Copy + std::fmt::Debug, +{ + #[inline(always)] + fn write(&mut self, t: T) { + unsafe { + *self.ptr = t; + self.remain_on_k -= 1; + self.ptr = self.ptr.add(self.panel_width); + if self.remain_on_k == 0 { + self.remain_on_k = self.k; + self.remain_on_mn -= 1; + if self.remain_on_mn > 0 { + self.ptr = self.ptr.offset(self.next_mn_offset); + } else { + self.ptr = self.ptr.offset(self.next_panel_offset); + self.current_panel += 1; + if self.current_panel == self.panels - 1 { + self.remain_on_mn = self.last_panel_width; + } else { + self.remain_on_mn = self.panel_width; + } + } + } + } + } +} + +#[inline(never)] +unsafe fn pack_mn_major( + b: *const u8, + packed: *mut u8, + panel_len: usize, + k_stride_bytes: isize, + mn_range_bytes: Range, + k_range: Range, +) { + unsafe { + let mnr = std::mem::size_of::(); + let full_panes = mn_range_bytes.len() / mnr; + let partial_pane = mn_range_bytes.len() % mnr; + for k in 0..k_range.len() { + let mut p_row = packed.add(k * mnr); + let mut b_row = b.offset( + (k_range.start + k) as isize * k_stride_bytes + mn_range_bytes.start as isize, + ); + for _ in 0..full_panes { + p_row.copy_from_nonoverlapping(b_row, mnr); + p_row = p_row.add(panel_len); + b_row = b_row.add(mnr); + } + if partial_pane > 0 { + 
p_row.copy_from_nonoverlapping(b_row, partial_pane); + } + } + } +} + +pub trait Packing { + fn packing(r: usize) -> PackedFormat; +} + +impl Packing for D { + fn packing(r: usize) -> PackedFormat { + PackedFormat::new(Self::datum_type(), r, vector_size()) + } +} + +#[cfg(test)] +mod test { + use std::ops::Range; + + use proptest::prelude::*; + use tract_data::internal::num_integer::Integer; + use tract_data::internal::tract_ndarray::Zip; + use tract_data::internal::*; + use tract_ndarray::prelude::*; + + #[derive(Debug)] + struct PackProblem { + k: usize, + mn: usize, + is_a: bool, + r: usize, + k_range: Range, + mn_range: Range, + align_panel: usize, + } + + impl PackProblem { + fn input(&self) -> Array2 { + let shape = if self.is_a { (self.mn, self.k) } else { (self.k, self.mn) }; + let data = (0..(self.k * self.mn) as u32).collect(); + Array2::from_shape_vec(shape, data).unwrap() + } + + fn packer(&self) -> Array2 { + let panels = self.mn_range.len().divceil(self.r); + let packer = super::PackedFormat::new(u32::datum_type(), self.r, self.align_panel) + .with_end_padding_record(0); + let input = self.input().into_tensor(); + let panel_len = packer.single_panel_len(self.k_range.len()); + let mut output = + Tensor::zero::(&[packer.len(self.k_range.len(), self.mn_range.len())]) + .unwrap(); + unsafe { + packer.pack_segment( + output.view_mut(), + input.view(), + self.is_a as usize, + !self.is_a as usize, + self.k_range.clone(), + self.mn_range.clone(), + ) + }; + output.into_array::().unwrap().into_shape_with_order((panels, panel_len)).unwrap() + } + + fn reference(&self) -> Array2 { + let input = self.input(); + let panels = self.mn_range.len().divceil(self.r); + let len = Integer::next_multiple_of(&(self.k_range.len() * self.r), &self.align_panel); + Array2::from_shape_fn([panels, len], |(panel, z)| { + let k = z / self.r; + let x = z % self.r; + let mn = panel * self.r + x + self.mn_range.start; + let k = k + self.k_range.start; + let coords = if self.is_a { 
(mn, k) } else { (k, mn) }; + *input.get(coords).unwrap_or(&0) + }) + } + + fn valid(&self) -> Array2 { + let panels = self.mn_range.len().divceil(self.r); + let len = Integer::next_multiple_of(&(self.k_range.len() * self.r), &self.align_panel); + Array2::from_shape_fn([panels, len], |(panel, z)| { + let k = z / self.r; + let x = z % self.r; + let k = k + self.k_range.start; + let mn = panel * self.r + x + self.mn_range.start; + k < self.k_range.end.min(self.k) && mn < self.mn_range.end.min(self.mn) + }) + } + + fn check(&self) { + let mut packer = self.packer(); + let mut reference = self.reference(); + let valid = self.valid(); + Zip::from(&mut packer).and(&valid).for_each(|p, v| *p = if *v { *p } else { -1 as _ }); + Zip::from(&mut reference) + .and(&valid) + .for_each(|p, v| *p = if *v { *p } else { -1 as _ }); + assert_eq!(packer, reference); + } + } + + impl Arbitrary for PackProblem { + type Parameters = (); + type Strategy = BoxedStrategy; + fn arbitrary_with(_args: ()) -> Self::Strategy { + (any::(), 1usize..9, 1usize..20, 1usize..20) + .prop_flat_map(|(is_a, r, k, mn)| { + ( + Just((is_a, r, k, mn)), + sub_range_strat(0..k), + sub_range_strat(0..mn), + 1usize..5, + ) + }) + .prop_map(|((is_a, r, k, mn), k_range, mn_range, align_panel)| PackProblem { + k, + mn, + is_a, + r, + k_range, + mn_range, + align_panel, + }) + .boxed() + } + } + + fn sub_range_strat(range: Range) -> BoxedStrategy> { + (0..range.len()) + .prop_flat_map(|cropped| (Just(cropped), 0..=cropped)) + .prop_map(move |(cropped, left)| range.start + left..range.end - (cropped - left)) + .boxed() + } + + proptest::proptest! 
{ + #[test] + fn prop(pb in any::()) { + pb.check(); + } + + #[test] + fn subrange_prop(_range in sub_range_strat(0..20)) { + } + + } + + #[test] + fn simple_b_1() { + PackProblem { + k: 2, + mn: 1, + is_a: false, + r: 1, + k_range: 0..2, + mn_range: 0..1, + align_panel: 1, + } + .check(); + } + + #[test] + fn simple_b_2() { + PackProblem { + k: 2, + mn: 2, + is_a: false, + r: 1, + k_range: 0..2, + mn_range: 0..2, + align_panel: 1, + } + .check() + } + + #[test] + fn simple_b_3() { + PackProblem { + k: 2, + mn: 1, + is_a: false, + r: 4, + k_range: 0..2, + mn_range: 0..1, + align_panel: 1, + } + .check(); + } + + #[test] + fn simple_b_4() { + PackProblem { + k: 1, + mn: 3, + is_a: false, + r: 2, + k_range: 0..1, + mn_range: 0..3, + align_panel: 1, + } + .check(); + } + + #[test] + fn simple_a_1() { + PackProblem { + k: 2, + mn: 2, + is_a: true, + r: 1, + k_range: 0..2, + mn_range: 0..2, + align_panel: 1, + } + .check(); + } + + #[test] + fn simple_a_2() { + PackProblem { + k: 2, + mn: 3, + is_a: true, + r: 2, + k_range: 0..2, + mn_range: 0..3, + align_panel: 1, + } + .check(); + } + + #[test] + fn range_k_0() { + PackProblem { + k: 2, + mn: 1, + is_a: false, + r: 1, + k_range: 1..2, + mn_range: 0..1, + align_panel: 1, + } + .check(); + } + + #[test] + fn range_k_1() { + PackProblem { + k: 2, + mn: 2, + is_a: false, + r: 1, + k_range: 0..2, + mn_range: 0..1, + align_panel: 1, + } + .check(); + } + + #[test] + fn range_k_2() { + PackProblem { + k: 2, + mn: 1, + is_a: false, + r: 6, + k_range: 1..2, + mn_range: 0..1, + align_panel: 1, + } + .check(); + } + + #[test] + fn range_mn_0() { + PackProblem { + k: 1, + mn: 2, + is_a: false, + r: 2, + k_range: 0..1, + mn_range: 0..1, + align_panel: 1, + } + .check(); + } + + #[test] + fn range_b_4() { + PackProblem { + k: 1, + mn: 2, + is_a: false, + r: 6, + k_range: 0..1, + mn_range: 1..2, + align_panel: 1, + } + .check(); + } + + #[test] + fn range_b_5() { + PackProblem { + k: 1, + mn: 7, + is_a: false, + r: 6, + k_range: 
0..1, + mn_range: 1..7, + align_panel: 1, + } + .check(); + } + + #[test] + fn align_a_1() { + PackProblem { + k: 2, + mn: 2, + is_a: true, + r: 1, + k_range: 0..1, + mn_range: 0..2, + align_panel: 2, + } + .check(); + } + + #[test] + fn align_b_1() { + PackProblem { + k: 1, + mn: 1, + is_a: false, + r: 1, + k_range: 0..1, + mn_range: 0..1, + align_panel: 2, + } + .check(); + } + + #[test] + fn align_b_2() { + PackProblem { + k: 3, + mn: 1, + is_a: false, + r: 1, + k_range: 0..3, + mn_range: 0..1, + align_panel: 2, + } + .check(); + } + + #[test] + fn align_b_3() { + PackProblem { + k: 1, + mn: 1, + is_a: false, + r: 3, + k_range: 0..1, + mn_range: 0..1, + align_panel: 2, + } + .check(); + } + + #[test] + fn align_b_4() { + PackProblem { + k: 2, + mn: 1, + is_a: false, + r: 1, + k_range: 0..1, + mn_range: 0..1, + align_panel: 2, + } + .check(); + } + + #[test] + fn align_b_5() { + PackProblem { + k: 1, + mn: 5, + is_a: false, + r: 4, + k_range: 0..1, + mn_range: 0..5, + align_panel: 3, + } + .check(); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs new file mode 100644 index 000000000..616e4310b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs @@ -0,0 +1,42 @@ +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::frame::reduce::ReduceKer; + use crate::LADatum; + use num_traits::{AsPrimitive, Float}; + use proptest::test_runner::TestCaseResult; + + #[macro_export] + macro_rules! max_frame_tests { + ($cond:expr, $t: ty, $ker:ty) => { + proptest::proptest! 
{ + #[test] + fn prop(xs in proptest::collection::vec(-25f32..25.0, 0..100)) { + if $cond { + $crate::frame::reduce::max::test::test_max::<$ker, $t>(&*xs).unwrap() + } + } + } + + #[test] + fn empty() { + if $cond { + $crate::frame::reduce::max::test::test_max::<$ker, $t>(&[]).unwrap() + } + } + }; + } + + pub fn test_max, T: LADatum + Float>(values: &[f32]) -> TestCaseResult + where + f32: AsPrimitive, + { + crate::setup_test_logger(); + let values: Vec = values.iter().copied().map(|x| x.as_()).collect(); + crate::frame::reduce::test::test_reduce::( + &values, + ::min_value(), + |a, b| a.max(b), + ) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs new file mode 100644 index 000000000..ecc13535f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs @@ -0,0 +1,300 @@ +pub mod max; +pub mod softmax; +pub mod sum; + +use std::fmt::Debug; +use std::marker::PhantomData; + +use tract_data::TractResult; + +use crate::LADatum; + +use super::element_wise_helper::{map_reduce_slice_with_alignment, reduce_slice_with_alignment}; + +macro_rules! reduce_impl_wrap { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $neutral: expr, $run: item, $reduce_two: item) => { + paste! 
{ + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + pub struct $func; + + impl crate::frame::reduce::ReduceKer<$ti, $params> for $func { + #[inline(always)] + fn name() -> &'static str { + stringify!($func) + } + #[inline(always)] + fn nr() -> usize { + $nr + } + #[inline(always)] + fn alignment_items() -> usize { + $alignment_items + } + #[inline(always)] + fn alignment_bytes() -> usize { + $alignment_items * std::mem::size_of::<$ti>() + } + #[inline(always)] + fn neutral() -> $ti { + $neutral + } + $run + $reduce_two + } + } + }; +} + +pub trait Reduce: Send + Sync + Debug + dyn_clone::DynClone +where + Params: Copy + Send + Sync + Debug + 'static + Default, + T: Copy + Debug + PartialEq + Send + Sync, +{ + fn name(&self) -> &'static str; + fn run(&self, vec: &[T]) -> TractResult { + self.run_with_params(vec, Params::default()) + } + fn run_with_params(&self, vec: &[T], params: Params) -> TractResult; +} + +dyn_clone::clone_trait_object!( Reduce where T: Copy, Params: Copy); + +#[derive(Debug, Clone, new)] +pub struct ReduceImpl +where + T: LADatum, + Params: Copy + Send + Sync + Debug + 'static + Default, + K: ReduceKer + Clone, +{ + phantom: PhantomData<(K, T, Params)>, +} + +impl Reduce for ReduceImpl +where + T: LADatum, + Params: Copy + Send + Sync + Debug + 'static + Default, + K: ReduceKer + Clone, +{ + fn name(&self) -> &'static str { + K::name() + } + + fn run_with_params(&self, vec: &[T], params: Params) -> TractResult { + reduce_slice_with_alignment( + vec, + |data| K::run(data, params), + K::nr(), + K::alignment_bytes(), + K::neutral(), + K::reduce_two, + ) + } +} + +pub trait ReduceKer: + Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static +where + Params: Copy + Send + Sync + Debug + 'static + Default, + T: LADatum, +{ + fn name() -> &'static str; + fn alignment_bytes() -> usize { + Self::alignment_items() * T::datum_type().size_of() + } + fn alignment_items() -> usize; + fn nr() -> usize; + fn neutral() -> T; + fn 
reduce_two(a: T, b: T) -> T; + fn run(vec: &[T], params: Params) -> T; + fn red() -> Box> { + Box::new(ReduceImpl::::new()) + } +} + +#[allow(unused_macros)] +macro_rules! map_reduce_impl_wrap { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $map_neutral: expr, $reduce_neutral: expr, $run: item, $reduce_two: item) => { + paste! { + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + pub struct $func; + + impl crate::frame::reduce::MapReduceKer<$ti, $params> for $func { + #[inline(always)] + fn name() -> &'static str { + stringify!($func) + } + #[inline(always)] + fn nr() -> usize { + $nr + } + #[inline(always)] + fn alignment_items() -> usize { + $alignment_items + } + #[inline(always)] + fn alignment_bytes() -> usize { + $alignment_items * std::mem::size_of::<$ti>() + } + #[inline(always)] + fn map_neutral() -> $ti { + $map_neutral + } + #[inline(always)] + fn reduce_neutral() -> $ti { + $reduce_neutral + } + $run + $reduce_two + } + } + }; +} + +pub trait MapReduce: Send + Sync + Debug + dyn_clone::DynClone +where + Params: Copy + Send + Sync + Debug + 'static + Default, + T: Copy + Debug + PartialEq + Send + Sync, +{ + fn name(&self) -> &'static str; + fn run(&self, vec: &mut [T]) -> TractResult { + self.run_with_params(vec, Params::default()) + } + fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult; +} + +dyn_clone::clone_trait_object!( MapReduce where T: Copy, Params: Copy); + +#[derive(Debug, Clone, new)] +pub struct MapReduceImpl +where + T: LADatum, + Params: Copy + Send + Sync + Debug + 'static + Default, + K: MapReduceKer + Clone, +{ + phantom: PhantomData<(K, T, Params)>, +} + +impl MapReduce for MapReduceImpl +where + T: LADatum, + Params: Copy + Send + Sync + Debug + 'static + Default, + K: MapReduceKer + Clone, +{ + fn name(&self) -> &'static str { + K::name() + } + fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult { + map_reduce_slice_with_alignment( + vec, + |data| 
K::run(data, params), + K::nr(), + K::alignment_bytes(), + K::map_neutral(), + K::reduce_neutral(), + K::reduce_two, + ) + } +} + +pub trait MapReduceKer: + Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static +where + Params: Copy + Send + Sync + Debug + 'static + Default, + T: LADatum, +{ + fn name() -> &'static str; + fn alignment_bytes() -> usize { + Self::alignment_items() * T::datum_type().size_of() + } + fn alignment_items() -> usize; + fn nr() -> usize; + fn map_neutral() -> T; + fn reduce_neutral() -> T; + fn reduce_two(a: T, b: T) -> T; + fn run(vec: &mut [T], params: Params) -> T; + fn red() -> Box> { + Box::new(MapReduceImpl::::new()) + } +} + +#[cfg(test)] +pub mod test { + use super::*; + use proptest::test_runner::{TestCaseError, TestCaseResult}; + use tract_data::internal::*; + use tract_data::itertools::Itertools; + + pub fn test_reduce, T: LADatum>( + values: &[T], + neutral: T, + reference_reduce: impl Fn(T, T) -> T, + ) -> TestCaseResult { + test_reduce_params::(values, neutral, reference_reduce, ()) + } + + pub fn test_reduce_params, T: LADatum, Params>( + values: &[T], + neutral: T, + reference_reducer: impl Fn(T, T) -> T, + params: Params, + ) -> TestCaseResult + where + Params: Copy + Send + Sync + Debug + 'static + Default, + { + crate::setup_test_logger(); + let op = K::red(); + let expected = values.iter().fold(neutral, |acc, i| reference_reducer(acc, *i)); + let found = values; + let red = op.run_with_params(found, params).unwrap(); + tensor0(red) + .close_enough(&tensor0(expected), true) + .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?; + Ok(()) + } + + pub fn test_map_reduce, T: LADatum>( + values: &[T], + map_neutral: T, + neutral: T, + reference_map: impl Fn(T) -> T, + reference_reduce: impl Fn(T, T) -> T, + ) -> TestCaseResult { + test_map_reduce_params::( + values, + map_neutral, + neutral, + reference_map, + reference_reduce, + (), + ) + } + + pub fn test_map_reduce_params, T: LADatum, Params>( + values: 
&[T], + _neutral: T, + map_neutral: T, + reference_map: impl Fn(T) -> T, + reference_reducer: impl Fn(T, T) -> T, + params: Params, + ) -> TestCaseResult + where + Params: Copy + Send + Sync + Debug + 'static + Default, + { + crate::setup_test_logger(); + let op = K::red(); + let mut found = values.to_vec(); + let expected_values = values.iter().copied().map(reference_map).collect_vec(); + let expected_reduced = + expected_values.iter().fold(map_neutral, |acc, i| reference_reducer(acc, *i)); + let red = op.run_with_params(&mut found, params).unwrap(); + tensor1(&found) + .close_enough(&tensor1(&expected_values), Approximation::SuperApproximate) + .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?; + tensor0(red) + .close_enough(&tensor0(expected_reduced), Approximation::SuperApproximate) + .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?; + Ok(()) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs new file mode 100644 index 000000000..a51708643 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs @@ -0,0 +1,86 @@ +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::frame::reduce::MapReduceKer; + use crate::LADatum; + use num_traits::{AsPrimitive, Float}; + use proptest::test_runner::TestCaseResult; + + #[macro_export] + macro_rules! softmax_l2_frame_tests { + ($cond:expr, $t: ty, $ker:ty) => { + proptest::proptest! 
{ + #[test] + fn prop(xs in proptest::collection::vec(-25f32..25.0, 1..100)) { + if $cond { + $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&*xs).unwrap() + } + } + } + + #[test] + fn single() { + if $cond { + $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[0.0]).unwrap() + } + } + + #[test] + fn two_zeros() { + if $cond { + $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[0.0, 0.0]).unwrap() + } + } + + #[test] + fn two_0() { + if $cond { + $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[ + 16.62555, 21.950674, + ]) + .unwrap() + } + } + + #[test] + fn two_1() { + if $cond { + $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[0.0f32, 0.38132212]) + .unwrap() + } + } + + #[test] + fn two_missing_max() { + if $cond { + $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[ + -46.15512, 42.875168, + ]) + .unwrap() + } + } + }; + } + + pub fn test_softmax_l2, T>( + values: &[f32], + ) -> TestCaseResult + where + T: LADatum + Float + AsPrimitive, + f32: AsPrimitive, + { + use crate::generic::reduce::softmax_l2::fast_compact_exp_f32; + crate::setup_test_logger(); + let max = values.iter().max_by(|a, b| a.total_cmp(b)).unwrap(); + let values: Vec = values.iter().copied().map(|x| x.as_()).collect(); + crate::frame::reduce::test::test_map_reduce_params::( + &values, + ::min_value(), + T::zero(), + // |x| (x - max.as_()).exp(), + |x| fast_compact_exp_f32(x.as_() - max).as_(), + |a, b| a + b, + max.as_(), + ) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs new file mode 100644 index 000000000..16d4b8970 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs @@ -0,0 +1,54 @@ +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::frame::reduce::ReduceKer; + use crate::LADatum; + use num_traits::{AsPrimitive, Float, Zero}; + use proptest::test_runner::TestCaseResult; + + 
#[macro_export] + macro_rules! sum_frame_tests { + ($cond:expr, $t: ty, $ker:ty) => { + proptest::proptest! { + #[test] + fn prop(xs in proptest::collection::vec(-25_isize..25, 0..100)) { + if $cond { + let xs_float = xs.into_iter().map(|it| it as f32).collect::>(); + $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&*xs_float).unwrap() + } + } + } + + #[test] + fn empty() { + if $cond { + $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&[]).unwrap() + } + } + + #[test] + fn simple() { + if $cond { + $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&[1.0, 2.0]).unwrap() + } + } + #[test] + fn multiple_tile() { + if $cond { + $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&[1.0; 35]).unwrap() + } + } + }; + } + + pub fn test_sum(values: &[f32]) -> TestCaseResult + where + K: ReduceKer, + f32: AsPrimitive, + T: LADatum + Float + Zero + AsPrimitive, + { + crate::setup_test_logger(); + let values: Vec = values.iter().copied().map(|x| x.as_()).collect(); + crate::frame::reduce::test::test_reduce::(&values, ::zero(), |a, b| a + b) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs b/vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs new file mode 100644 index 000000000..1a3ea85dc --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs @@ -0,0 +1,96 @@ +macro_rules! sigmoid_impl { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => { + ew_impl!($ti, $func, $nr, $alignment_items); + #[cfg(test)] + paste! { + mod [] { + use super::*; + sigmoid_frame_tests!($cond, $ti, $func); + } + } + }; +} + +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::{frame::element_wise::*, LADatum}; + use num_traits::{AsPrimitive, Float}; + use proptest::test_runner::TestCaseResult; + + #[macro_export] + macro_rules! sigmoid_frame_tests { + ($cond:expr, $t: ty, $ker:ty) => { + proptest::proptest! 
{ + #[test] + fn sigmoid(xs in proptest::collection::vec(-25f32..25.0, 0..100)) { + if $cond { + $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&*xs).unwrap() + } + } + } + + #[test] + fn sigmoid_4_magic() { + if $cond { + $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[ + 0f32, -20.0, 20.0, 0.0, + ]) + .unwrap() + } + } + + #[test] + fn sigmoid_4zeros() { + if $cond { + $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[0.0; 4]).unwrap(); + } + } + + #[test] + fn sigmoid_20_ones() { + if $cond { + $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[1.0; 20]).unwrap(); + } + } + + #[test] + fn sigmoid_18_zeros() { + if $cond { + $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[0.0; 18]).unwrap(); + } + } + + #[test] + fn sigmoid_asymptots() { + use tract_data::internal::*; + use $crate::frame::element_wise::*; + if $cond { + let mut input: Vec<$t> = [-100f32, 100f32] + .iter() + .map(|x| >::as_(*x)) + .collect(); + let expected: Vec<$t> = [-0f32, 1f32] + .iter() + .map(|x| >::as_(*x)) + .collect(); + <$ker>::ew().run(&mut input).unwrap(); + tensor1(&input) + .close_enough(&tensor1(&expected), Approximation::Close) + .unwrap(); + } + } + }; + } + + pub fn test_sigmoid, T: LADatum + Float>(values: &[f32]) -> TestCaseResult + where + f32: AsPrimitive, + { + crate::setup_test_logger(); + let values: Vec = values.iter().copied().map(|x| x.as_()).collect(); + crate::frame::element_wise::test::test_element_wise::(&values, |x| { + (1f32).as_() / (1f32.as_() + (-x).exp()) + }) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/tanh.rs b/vendor/tract-linalg-0.22.1/src/frame/tanh.rs new file mode 100644 index 000000000..fe2af1648 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/tanh.rs @@ -0,0 +1,101 @@ +macro_rules! tanh_impl { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => { + ew_impl!($ti, $func, $nr, $alignment_items); + #[cfg(test)] + paste! 
{ + mod [] { + use super::*; + tanh_frame_tests!($cond, $ti, $func); + } + } + }; +} + +#[cfg(test)] +#[macro_use] +pub mod test { + use crate::frame::element_wise::*; + use crate::LADatum; + use num_traits::float::Float; + use num_traits::AsPrimitive; + use proptest::test_runner::TestCaseResult; + + #[macro_export] + macro_rules! tanh_frame_tests { + ($cond:expr, $t:ty, $ker:ty) => { + proptest::proptest! { + #[test] + fn tanh(xs in proptest::collection::vec(-25f32..25.0, 0..100)) { + if $cond { + $crate::frame::tanh::test::test_tanh::<$ker, $t>(&*xs).unwrap() + } + } + } + + #[test] + fn tanh_4_magic() { + if $cond { + $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0f32, -20.0, 20.0, 0.0]) + .unwrap() + } + } + + #[test] + fn tanh_4zeros() { + if $cond { + $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0.0; 4]).unwrap(); + } + } + + #[test] + fn tanh_20_ones() { + if $cond { + $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[1.0; 20]).unwrap(); + } + } + + #[test] + fn tanh_18_zeros() { + if $cond { + $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0.0; 18]).unwrap(); + } + } + + #[test] + fn tanh_foo() { + if $cond { + $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0.67503357]).unwrap(); + } + } + + #[test] + fn tanh_asymptots() { + use tract_data::internal::*; + use $crate::frame::element_wise::*; + if $cond { + let mut input: Vec<$t> = [-100f32, 100f32] + .iter() + .map(|x| >::as_(*x)) + .collect(); + let expected: Vec<$t> = [-1f32, 1f32] + .iter() + .map(|x| >::as_(*x)) + .collect(); + <$ker>::ew().run(&mut input).unwrap(); + tensor1(&input) + .close_enough(&tensor1(&expected), Approximation::Close) + .unwrap(); + } + } + }; + } + + pub fn test_tanh, T: LADatum + Float>(values: &[f32]) -> TestCaseResult + where + f32: AsPrimitive, + { + crate::setup_test_logger(); + let values: Vec = values.iter().copied().map(|x| x.as_()).collect(); + crate::frame::element_wise::test::test_element_wise::(&values, |x| x.tanh()) + } +} diff --git 
a/vendor/tract-linalg-0.22.1/src/frame/unicast.rs b/vendor/tract-linalg-0.22.1/src/frame/unicast.rs new file mode 100644 index 000000000..fca39f7a5 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/unicast.rs @@ -0,0 +1,233 @@ +use std::fmt::Debug; +use std::marker::PhantomData; + +use tract_data::internal::TensorView; +use tract_data::TractResult; + +use crate::frame::element_wise_helper::TempBuffer; +use crate::{LADatum, LinalgFn}; + +macro_rules! unicast_impl_wrap { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $run: item) => { + paste! { + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + pub struct $func; + + impl crate::frame::unicast::UnicastKer<$ti> for $func { + #[inline(always)] + fn name() -> &'static str { + stringify!($func) + } + #[inline(always)] + fn nr() -> usize { + $nr + } + #[inline(always)] + fn alignment_items() -> usize { + $alignment_items + } + $run + } + } + }; +} + +pub trait Unicast: Send + Sync + Debug + dyn_clone::DynClone +where + T: Copy + Debug + PartialEq + Send + Sync, +{ + fn name(&self) -> &'static str; + fn run(&self, a: &mut [T], b: &[T]) -> TractResult<()>; +} + +dyn_clone::clone_trait_object!( Unicast where T: Copy); + +#[derive(Debug, Clone, new)] +pub struct UnicastImpl +where + T: LADatum, + K: UnicastKer + Clone, +{ + phantom: PhantomData<(K, T)>, +} + +impl UnicastImpl +where + T: LADatum, + K: UnicastKer + Clone, +{ +} +impl Unicast for UnicastImpl +where + T: LADatum, + K: UnicastKer + Clone, +{ + fn name(&self) -> &'static str { + K::name() + } + fn run(&self, a: &mut [T], b: &[T]) -> TractResult<()> { + unicast_with_alignment(a, b, |a, b| K::run(a, b), K::nr(), K::alignment_bytes()) + } +} + +pub trait UnicastKer: Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static +where + T: LADatum, +{ + fn name() -> &'static str; + fn alignment_bytes() -> usize { + Self::alignment_items() * T::datum_type().size_of() + } + fn alignment_items() -> usize; + fn nr() -> usize; + fn 
run(a: &mut [T], b: &[T]); + fn bin() -> Box { + Box::new(|a: &mut TensorView, b: &TensorView| { + let a_slice = a.as_slice_mut()?; + let b_slice = b.as_slice()?; + UnicastImpl::::new().run(a_slice, b_slice) + }) + } +} + +std::thread_local! { + static TMP: std::cell::RefCell<(TempBuffer, TempBuffer)> = std::cell::RefCell::new((TempBuffer::default(), TempBuffer::default())); +} + +pub(crate) fn unicast_with_alignment( + a: &mut [T], + b: &[T], + f: impl Fn(&mut [T], &[T]), + nr: usize, + alignment_bytes: usize, +) -> TractResult<()> +where + T: LADatum, +{ + if a.is_empty() { + return Ok(()); + } + unsafe { + TMP.with(|buffers| { + let mut buffers = buffers.borrow_mut(); + buffers.0.ensure(nr * T::datum_type().size_of(), alignment_bytes); + buffers.1.ensure(nr * T::datum_type().size_of(), alignment_bytes); + let tmp_a = std::slice::from_raw_parts_mut(buffers.0.buffer as *mut T, nr); + let tmp_b = std::slice::from_raw_parts_mut(buffers.1.buffer as *mut T, nr); + let mut compute_via_temp_buffer = |a: &mut [T], b: &[T]| { + tmp_a[..a.len()].copy_from_slice(a); + tmp_b[..b.len()].copy_from_slice(b); + f(tmp_a, tmp_b); + a.copy_from_slice(&tmp_a[..a.len()]) + }; + + let mut num_element_processed = 0; + let a_prefix_len = a.as_ptr().align_offset(alignment_bytes).min(a.len()); + let b_prefix_len = b.as_ptr().align_offset(alignment_bytes).min(b.len()); + assert!( + a_prefix_len == b_prefix_len, + "Both inputs should be of the same alignement, got {a_prefix_len:?}, {b_prefix_len:?}" + ); + let mut applied_prefix_len = 0; + if a_prefix_len > 0 { + // Incomplete tile needs to be created to process unaligned data. + let sub_a = &mut a[..a_prefix_len]; + let sub_b = &b[..a_prefix_len]; + compute_via_temp_buffer(sub_a, sub_b); + num_element_processed += a_prefix_len; + applied_prefix_len = a_prefix_len; + } + + let num_complete_tiles = (a.len() - applied_prefix_len) / nr; + if num_complete_tiles > 0 { + // Process all tiles that are complete. 
+ let sub_a = &mut a[applied_prefix_len..][..(num_complete_tiles * nr)]; + let sub_b = &b[applied_prefix_len..][..(num_complete_tiles * nr)]; + f(sub_a, sub_b); + num_element_processed += num_complete_tiles * nr; + } + + if num_element_processed < a.len() { + // Incomplete tile needs to be created to process remaining elements. + compute_via_temp_buffer( + &mut a[num_element_processed..], + &b[num_element_processed..], + ); + } + }) + } + Ok(()) +} + +#[cfg(test)] +#[macro_use] +pub mod test { + use super::*; + use crate::LADatum; + use proptest::test_runner::{TestCaseError, TestCaseResult}; + use tract_data::internal::*; + use tract_num_traits::{AsPrimitive, Float}; + + pub fn test_unicast, T: LADatum>( + a: &mut [T], + b: &[T], + reference: impl Fn(T, T) -> T, + ) -> TestCaseResult { + crate::setup_test_logger(); + let op = UnicastImpl::::new(); + let expected = a.iter().zip(b.iter()).map(|(a, b)| (reference)(*a, *b)).collect::>(); + op.run(a, b).unwrap(); + tensor1(a) + .close_enough(&tensor1(&expected), true) + .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?; + Ok(()) + } + + pub fn test_unicast_t, T: LADatum + Float>( + a: &[f32], + b: &[f32], + func: impl Fn(T, T) -> T, + ) -> TestCaseResult + where + f32: AsPrimitive, + { + crate::setup_test_logger(); + let vec_a: Vec = a.iter().copied().map(|x| x.as_()).collect(); + // We allocate a tensor to ensure allocation is done with alignement + let mut a = unsafe { Tensor::from_slice_align(vec_a.as_slice(), vector_size()).unwrap() }; + let vec_b: Vec = b.iter().copied().map(|x| x.as_()).collect(); + // We allocate a tensor to ensure allocation is done with alignement + let b = unsafe { Tensor::from_slice_align(vec_b.as_slice(), vector_size()).unwrap() }; + crate::frame::unicast::test::test_unicast::( + a.as_slice_mut::().unwrap(), + b.as_slice::().unwrap(), + func, + ) + } + + #[macro_export] + macro_rules! unicast_frame_tests { + ($cond:expr, $t: ty, $ker:ty, $func:expr) => { + pastey::paste! 
{ + proptest::proptest! { + #[test] + fn []( + (a, b) in (0..100_usize).prop_flat_map(|len| (vec![-25f32..25.0; len], vec![-25f32..25.0; len])) + ) { + if $cond { + $crate::frame::unicast::test::test_unicast_t::<$ker, $t>(&*a, &*b, $func).unwrap() + } + } + } + + #[test] + fn []() { + if $cond { + $crate::frame::unicast::test::test_unicast_t::<$ker, $t>(&[], &[], $func).unwrap() + } + } + } + }; + } +} diff --git a/vendor/tract-linalg-0.22.1/src/frame/weights.rs b/vendor/tract-linalg-0.22.1/src/frame/weights.rs new file mode 100644 index 000000000..527893090 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/frame/weights.rs @@ -0,0 +1,80 @@ +use std::fmt::Debug; +use tract_data::prelude::DatumType; + +use crate::block_quant::{BlockQuant, PackedBlockQuantFormat}; + +use crate::mmm::MMMInputFormat; +use crate::pack::PackedFormat; + +#[derive(Clone)] +pub enum WeightType { + Plain(DatumType), + BlockQuant(Box), +} + +impl From for WeightType { + fn from(value: DatumType) -> Self { + match value { + DatumType::F16 => WeightType::Plain(DatumType::F16), + DatumType::F32 => WeightType::Plain(DatumType::F32), + DatumType::F64 => WeightType::Plain(DatumType::F64), + DatumType::I32 => WeightType::Plain(DatumType::I32), + DatumType::I8 | DatumType::QI8(_) => WeightType::Plain(DatumType::I8), + DatumType::U8 | DatumType::QU8(_) => WeightType::Plain(DatumType::U8), + _ => panic!("Can't build a WeightType from {value:?}"), + } + } +} + +impl From> for WeightType { + fn from(value: Box) -> Self { + (&*value).into() + } +} + +impl From<&dyn MMMInputFormat> for WeightType { + fn from(value: &dyn MMMInputFormat) -> Self { + if let Some(pf) = value.downcast_ref::() { + WeightType::Plain(pf.dt) + } else if let Some(pbqf) = value.downcast_ref::() { + WeightType::BlockQuant(dyn_clone::clone_box(&*pbqf.bq)) + } else { + todo!() + } + } +} + +impl PartialEq for WeightType { + fn eq(&self, other: &Self) -> bool { + use WeightType::*; + match (self, other) { + (Plain(a), Plain(b)) => a == 
b, + (BlockQuant(a), BlockQuant(b)) => a.same_as(&**b), + _ => false, + } + } +} + +impl From for WeightType { + fn from(value: BQ) -> Self { + WeightType::BlockQuant(dyn_clone::clone_box(&value)) + } +} + +impl Debug for WeightType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Plain(p) => write!(f, "{p:?}"), + Self::BlockQuant(bq) => write!(f, "{bq:?}"), + } + } +} + +impl WeightType { + pub fn as_dt(&self) -> Option { + match self { + WeightType::Plain(dt) => Some(*dt), + _ => None, + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/generic.rs b/vendor/tract-linalg-0.22.1/src/generic.rs new file mode 100644 index 000000000..f2030ff0b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic.rs @@ -0,0 +1,55 @@ +pub mod by_scalar; +pub mod erf; +pub mod leaky_relu; +pub mod lut; +pub mod mmm; +pub mod reduce; +pub mod rounding; +pub mod sigmoid; +pub mod tanh; +pub mod unicast; + +use tract_data::prelude::DatumType; + +use crate::by_scalar::ByScalarKer; +use crate::unicast::UnicastKer; +use crate::{BinOp, LinalgRegistry}; + +pub use self::by_scalar::{HMulByScalar8, SMulByScalar4}; +pub use self::erf::SErf4; +pub use self::leaky_relu::{HLeakyRelu8, SLeakyRelu4}; +pub use self::lut::GenericLut8; +pub use self::reduce::softmax_l2::SSoftMaxL2; +pub use self::rounding::{ScaleShiftAndRound, Scaler}; +pub use self::sigmoid::{HSigmoid8, SSigmoid4}; +pub use self::tanh::{HTanh8, STanh4}; + +pub(crate) fn register_all_unicast(registry: &mut LinalgRegistry) { + registry.insert((BinOp::Mul, DatumType::F32), Box::new(|| unicast::SUnicastMul4::bin())); + registry.insert((BinOp::Mul, DatumType::F16), Box::new(|| unicast::HUnicastMul8::bin())); + registry.insert((BinOp::Add, DatumType::F32), Box::new(|| unicast::SUnicastAdd4::bin())); + registry.insert((BinOp::Add, DatumType::F16), Box::new(|| unicast::HUnicastAdd8::bin())); + registry.insert((BinOp::Sub, DatumType::F32), Box::new(|| unicast::SUnicastSub4::bin())); + 
registry.insert((BinOp::Sub, DatumType::F16), Box::new(|| unicast::HUnicastSub8::bin())); + registry.insert((BinOp::SubF, DatumType::F32), Box::new(|| unicast::SUnicastSubF4::bin())); + registry.insert((BinOp::SubF, DatumType::F16), Box::new(|| unicast::HUnicastSubF8::bin())); + registry.insert((BinOp::Min, DatumType::F32), Box::new(|| unicast::SUnicastMin4::bin())); + registry.insert((BinOp::Min, DatumType::F16), Box::new(|| unicast::HUnicastMin8::bin())); + registry.insert((BinOp::Max, DatumType::F32), Box::new(|| unicast::SUnicastMax4::bin())); + registry.insert((BinOp::Max, DatumType::F16), Box::new(|| unicast::HUnicastMax8::bin())); +} + +pub(crate) fn register_all_by_scalar(registry: &mut LinalgRegistry) { + registry.insert((BinOp::Mul, DatumType::F32), Box::new(|| by_scalar::SMulByScalar4::bin())); + registry.insert((BinOp::Mul, DatumType::F16), Box::new(|| by_scalar::HMulByScalar8::bin())); + registry.insert((BinOp::Add, DatumType::F32), Box::new(|| by_scalar::SAddByScalar4::bin())); + registry.insert((BinOp::Add, DatumType::F16), Box::new(|| by_scalar::HAddByScalar8::bin())); + registry.insert((BinOp::Sub, DatumType::F32), Box::new(|| by_scalar::SSubByScalar4::bin())); + registry.insert((BinOp::Sub, DatumType::F16), Box::new(|| by_scalar::HSubByScalar8::bin())); + registry.insert((BinOp::SubF, DatumType::F32), Box::new(|| by_scalar::SSubFByScalar4::bin())); + registry.insert((BinOp::SubF, DatumType::F16), Box::new(|| by_scalar::HSubFByScalar8::bin())); + registry.insert((BinOp::Min, DatumType::F32), Box::new(|| by_scalar::SMinByScalar4::bin())); + registry.insert((BinOp::Min, DatumType::F16), Box::new(|| by_scalar::HMinByScalar8::bin())); + registry.insert((BinOp::Max, DatumType::F32), Box::new(|| by_scalar::SMaxByScalar4::bin())); + registry.insert((BinOp::Max, DatumType::F16), Box::new(|| by_scalar::HMaxByScalar8::bin())); +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs new file 
mode 100644 index 000000000..1aaab592f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs @@ -0,0 +1,181 @@ +use tract_data::internal::f16; + +by_scalar_impl_wrap!( + f32, + SMulByScalar4, + 4, + 4, + f32, + fn run(x: &mut [f32], s: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px *= s) + } +); + +by_scalar_impl_wrap!( + f32, + SAddByScalar4, + 4, + 4, + f32, + fn run(x: &mut [f32], s: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px += s) + } +); + +by_scalar_impl_wrap!( + f32, + SSubByScalar4, + 4, + 4, + f32, + fn run(x: &mut [f32], s: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px -= s) + } +); + +by_scalar_impl_wrap!( + f32, + SSubFByScalar4, + 4, + 4, + f32, + fn run(x: &mut [f32], s: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = s - *px) + } +); + +by_scalar_impl_wrap!( + f32, + SMinByScalar4, + 4, + 4, + f32, + fn run(x: &mut [f32], s: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = px.min(s)) + } +); + +by_scalar_impl_wrap!( + f32, + SMaxByScalar4, + 4, + 4, + f32, + fn run(x: &mut [f32], s: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = px.max(s)) + } +); + +#[cfg(test)] +#[macro_use] +pub mod mul_by_scalar_f32 { + use super::*; + by_scalar_frame_tests!(true, f32, SMulByScalar4, |a, b| a * b); + by_scalar_frame_tests!(true, f32, SAddByScalar4, |a, b| a + b ); + by_scalar_frame_tests!(true, f32, SSubByScalar4, 
|a, b| a - b); + by_scalar_frame_tests!(true, f32, SSubFByScalar4, |a, b| b - a); + by_scalar_frame_tests!(true, f32, SMinByScalar4, |a, b| a.min(b)); + by_scalar_frame_tests!(true, f32, SMaxByScalar4, |a, b| a.max(b)); +} + +by_scalar_impl_wrap!( + f16, + HMulByScalar8, + 8, + 8, + f16, + fn run(x: &mut [f16], s: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px *= s) + } +); + +by_scalar_impl_wrap!( + f16, + HAddByScalar8, + 8, + 8, + f16, + fn run(x: &mut [f16], s: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px += s) + } +); + +by_scalar_impl_wrap!( + f16, + HSubByScalar8, + 8, + 8, + f16, + fn run(x: &mut [f16], s: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px -= s) + } +); + +by_scalar_impl_wrap!( + f16, + HSubFByScalar8, + 8, + 8, + f16, + fn run(x: &mut [f16], s: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = s - *px) + } +); + +by_scalar_impl_wrap!( + f16, + HMinByScalar8, + 8, + 8, + f16, + fn run(x: &mut [f16], s: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = px.min(s)) + } +); + +by_scalar_impl_wrap!( + f16, + HMaxByScalar8, + 8, + 8, + f16, + fn run(x: &mut [f16], s: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = px.max(s)) + } +); + +#[cfg(test)] +#[macro_use] +pub mod mul_by_scalar_f16 { + use super::*; + by_scalar_frame_tests!(true, f16, HMulByScalar8, |a, b| a * b); + by_scalar_frame_tests!(true, f16, HAddByScalar8, |a, 
b| a + b); + by_scalar_frame_tests!(true, f16, HSubByScalar8, |a, b| a - b); + by_scalar_frame_tests!(true, f16, HSubFByScalar8, |a, b| b - a); + by_scalar_frame_tests!(true, f16, HMinByScalar8, |a, b| a.min(b)); + by_scalar_frame_tests!(true, f16, HMaxByScalar8, |a, b| a.max(b)); +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/erf.rs b/vendor/tract-linalg-0.22.1/src/generic/erf.rs new file mode 100644 index 000000000..8f4cdaf43 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/erf.rs @@ -0,0 +1,51 @@ +use crate::element_wise::ElementWiseKer; + +#[allow(non_upper_case_globals)] +#[allow(clippy::excessive_precision)] +fn serf(x: &mut f32) { + const a1: f32 = 0.0705230784; + const a2: f32 = 0.0422820123; + const a3: f32 = 0.0092705272; + const a4: f32 = 0.0001520143; + const a5: f32 = 0.0002765672; + const a6: f32 = 0.0000430638; + + let signum = x.signum(); + let abs = x.abs(); + let y = a6 * abs; + let y = (a5 + y) * abs; + let y = (a4 + y) * abs; + let y = (a3 + y) * abs; + let y = (a2 + y) * abs; + let y = (a1 + y) * abs; + let y = 1.0 - (y + 1.0).powi(16).recip(); + + *x = y.copysign(signum) +} + +#[derive(Clone, Debug)] +pub struct SErf4; + +impl ElementWiseKer for SErf4 { + fn name() -> &'static str { + "generic" + } + + fn alignment_items() -> usize { + 16 + } + + fn alignment_bytes() -> usize { + 16 + } + + fn nr() -> usize { + 4 + } + + fn run(x: &mut [f32], _: ()) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(serf) + } +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs new file mode 100644 index 000000000..0526319cd --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs @@ -0,0 +1,74 @@ +#![allow(clippy::excessive_precision)] +use crate::frame::element_wise::ElementWiseKer; +use tract_data::internal::*; +use tract_num_traits::Zero; + +#[derive(Clone, Debug)] 
+pub struct SLeakyRelu4; + +impl ElementWiseKer for SLeakyRelu4 { + fn name() -> &'static str { + "generic" + } + + fn alignment_bytes() -> usize { + 16 + } + + fn alignment_items() -> usize { + 4 + } + + fn nr() -> usize { + 4 + } + + fn run(x: &mut [f32], alpha: f32) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = if *px < 0. { *px * alpha } else { *px }); + } +} + +#[derive(Clone, Debug)] +pub struct HLeakyRelu8; + +impl ElementWiseKer for HLeakyRelu8 { + fn name() -> &'static str { + "generic" + } + + fn alignment_bytes() -> usize { + 16 + } + + fn alignment_items() -> usize { + 4 + } + + fn nr() -> usize { + 8 + } + + fn run(x: &mut [f16], alpha: f16) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = if *px < f16::zero() { *px * alpha } else { *px }) + } +} + +#[cfg(test)] +#[macro_use] +pub mod s { + leaky_relu_frame_tests!(true, f32, crate::generic::leaky_relu::SLeakyRelu4); +} + +#[cfg(test)] +#[macro_use] +pub mod h { + leaky_relu_frame_tests!( + true, + tract_data::internal::f16, + crate::generic::leaky_relu::HLeakyRelu8 + ); +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/lut.rs b/vendor/tract-linalg-0.22.1/src/generic/lut.rs new file mode 100644 index 000000000..b9023bd6c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/lut.rs @@ -0,0 +1,47 @@ +use crate::frame::lut::LutKer; + +#[derive(Clone, Debug, Hash)] +pub struct GenericLut8; + +impl LutKer for GenericLut8 { + fn name() -> &'static str { + "generic" + } + + fn input_alignment_bytes() -> usize { + 1 + } + + fn table_alignment_bytes() -> usize { + 1 + } + + fn n() -> usize { + 8 + } + + unsafe fn run(buf: *mut u8, len: usize, table: *const u8) { + unsafe { + debug_assert!(len % Self::n() == 0); + debug_assert!(buf as usize % Self::input_alignment_bytes() == 0); + debug_assert!(table 
as usize % Self::table_alignment_bytes() == 0); + for i in 0..((len / 8) as isize) { + let ptr = buf.offset(8 * i); + *ptr.offset(0) = *table.offset(*ptr.offset(0) as isize); + *ptr.offset(1) = *table.offset(*ptr.offset(1) as isize); + *ptr.offset(2) = *table.offset(*ptr.offset(2) as isize); + *ptr.offset(3) = *table.offset(*ptr.offset(3) as isize); + *ptr.offset(4) = *table.offset(*ptr.offset(4) as isize); + *ptr.offset(5) = *table.offset(*ptr.offset(5) as isize); + *ptr.offset(6) = *table.offset(*ptr.offset(6) as isize); + *ptr.offset(7) = *table.offset(*ptr.offset(7) as isize); + } + } + } +} + +#[cfg(test)] +#[macro_use] +pub mod test { + lut_frame_tests!(true, crate::generic::GenericLut8); +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/mmm.rs b/vendor/tract-linalg-0.22.1/src/generic/mmm.rs new file mode 100644 index 000000000..28c3bbd86 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/mmm.rs @@ -0,0 +1,453 @@ +#![allow(clippy::needless_range_loop)] +use num_traits::AsPrimitive; + +use tract_data::prelude::f16; +use tract_data::prelude::*; + +use super::*; +use crate::frame::block_quant::{BlockQuant, NibbleReader, PackedBlockQuantFormat, Q4_0}; +use crate::frame::mmm::*; +use crate::{has_fp16, LADatum, Ops}; + +macro_rules! scalar { + ($ab: expr, $m: expr, $f: expr) => { + for i in 0..$ab.len() { + for j in 0..$ab[0].len() { + $ab[i][j] = $f($m, $ab[i][j]) + } + } + }; +} + +macro_rules! per_row { + ($ab: expr, $m: expr, $f: expr) => { + for i in 0..$ab.len() { + for j in 0..$ab[0].len() { + $ab[i][j] = $f(*$m.add(i), $ab[i][j]) + } + } + }; +} + +macro_rules! 
per_col { + ($ab: expr, $m: expr, $f: expr) => { + for i in 0..$ab.len() { + for j in 0..$ab[0].len() { + $ab[i][j] = $f(*$m.add(j), $ab[i][j]) + } + } + }; +} + +unsafe fn add_mat_mul( + pa: *const u8, + pb: *const u8, + k: usize, + ab: &mut [[TI; NR]; MR], +) where + TA: LADatum + AsPrimitive, + TB: LADatum + AsPrimitive, + TI: LADatum, +{ + unsafe { + let a = pa as *const TA; + let b = pb as *const TB; + for ik in 0..k { + let a = std::slice::from_raw_parts(a.add(MR * ik), MR); + let b = std::slice::from_raw_parts(b.add(NR * ik), NR); + for i in 0..MR { + for j in 0..NR { + ab[i][j] += a[i].as_() * b[j].as_(); + } + } + } + } +} + +unsafe fn add_mat_mul_pq40( + pa: *const u8, + pb: *const u8, + k: usize, + ab: &mut [[TI; NR]; MR], +) where + TI: LADatum, + f16: AsPrimitive, + TB: AsPrimitive, + i8: AsPrimitive, +{ + unsafe { + assert!(k % Q4_0.block_len() == 0); + let len = (k * MR) / Q4_0.block_len() * Q4_0.block_bytes(); + let mut pa = NibbleReader::for_slice(std::slice::from_raw_parts(pa, len)); + let b = pb as *const TB; + for bk in 0..k / 32 { + let mut scales: [TI; MR] = [TI::zero(); MR]; + scales.iter_mut().for_each(|x| *x = pa.read_f16().as_()); + for ik in 0..32 { + let mut a: [TI; MR] = [TI::zero(); MR]; + a.iter_mut().zip(&scales).for_each(|(x, s)| *x = *s * (pa.read_i4() - 8).as_()); + let b = std::slice::from_raw_parts(b.add(NR * (ik + 32 * bk)), NR); + for i in 0..MR { + for j in 0..NR { + ab[i][j] += a[i] * b[j].as_(); + } + } + } + } + } +} + +unsafe fn add_mat_mul_pq40_scales_at_end( + pa: *const u8, + pb: *const u8, + k: usize, + ab: &mut [[TI; NR]; MR], +) where + TI: LADatum, + f16: AsPrimitive, + TB: AsPrimitive, + i8: AsPrimitive, +{ + unsafe { + assert!(k % Q4_0.block_len() == 0); + let len = (k * MR) / Q4_0.block_len() * Q4_0.block_bytes(); + let mut pa = NibbleReader::for_slice(std::slice::from_raw_parts(pa, len)); + let b = pb as *const TB; + for bk in 0..k / 32 { + let mut temp = [[TI::zero(); NR]; MR]; + for ik in 0..32 { + let mut a: 
[TI; MR] = [TI::zero(); MR]; + a.iter_mut().for_each(|x| *x = (pa.read_i4() - 8).as_()); + let b = std::slice::from_raw_parts(b.add(NR * (ik + 32 * bk)), NR); + for i in 0..MR { + for j in 0..NR { + temp[i][j] += a[i] * b[j].as_(); + } + } + } + for i in 0..MR { + let scale = pa.read_f16().as_(); + for j in 0..NR { + ab[i][j] += temp[i][j] * scale; + } + } + } + } +} + +unsafe fn add_unicast( + ab: &mut [[TI; NR]; MR], + other: &OutputStoreKer, +) where + TI: LADatum, + TO: LADatum + AsPrimitive, +{ + unsafe { + for i in 0usize..MR { + for j in 0usize..NR { + let value: *const TO = other + .ptr + .offset(other.row_byte_stride * i as isize + other.col_byte_stride * j as isize) + as _; + ab[i].as_mut()[j] += (*value).as_(); + } + } + } +} + +unsafe fn store_t( + tile: &OutputStoreKer, + ab: &[[TI; NR]; MR], +) where + TC: Copy, +{ + unsafe { + for i in 0usize..MR { + for j in 0usize..NR { + let loc: *mut TC = tile + .ptr + .offset(tile.row_byte_stride * i as isize + tile.col_byte_stride * j as isize) + as _; + let val: *const TC = (&ab[i].as_ref()[j]) as *const TI as _; + *loc = *val + } + } + } +} + +unsafe fn store_float_t( + tile: &OutputStoreKer, + ab: &[[TI; NR]; MR], +) where + TC: Copy + 'static, + TI: Copy + 'static + AsPrimitive, +{ + unsafe { + for i in 0usize..MR { + for j in 0usize..NR { + let loc: *mut TC = tile + .ptr + .offset(tile.row_byte_stride * i as isize + tile.col_byte_stride * j as isize) + as _; + let val = ab[i].as_ref()[j].as_(); + *loc = val + } + } + } +} + +#[inline(never)] +unsafe fn kernel(mut pnl: *const FusedKerSpec) -> isize +where + TI: LADatum + ScaleShiftAndRound + AsPrimitive, + TI: AsPrimitive + AsPrimitive + AsPrimitive, + usize: AsPrimitive, + f16: AsPrimitive, + f32: AsPrimitive, + f64: AsPrimitive, + i8: AsPrimitive, + i32: AsPrimitive, +{ + unsafe { + let mut ab = [[TI::zero(); NR]; MR]; + loop { + if pnl.is_null() { + break; + } + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => ab = std::mem::zeroed(), 
+ FusedKerSpec::LoadTile(col_major, _row_major) => { + for row in 0..MR { + for col in 0..NR { + ab[row][col] = *col_major.add(col * MR + row); + } + } + } + FusedKerSpec::ScalarAdd(a) => scalar!(ab, a, |a, b| a + b), + FusedKerSpec::ScalarMul(a) => scalar!(ab, a, |a, b| a * b), + FusedKerSpec::ScalarMin(m) => scalar!(ab, m, |a, b| if a < b { a } else { b }), + FusedKerSpec::ScalarMax(m) => scalar!(ab, m, |a, b| if a > b { a } else { b }), + FusedKerSpec::ScalarSub(m) => scalar!(ab, m, |a, b| a - b), + FusedKerSpec::ScalarSubF(m) => scalar!(ab, m, |a, b| b - a), + FusedKerSpec::LeakyRelu(m) => { + scalar!(ab, m, |a, b| if b > TI::zero() { b } else { a * b }) + } + FusedKerSpec::PerRowMin(m) => per_row!(ab, m, |a, b| if a < b { a } else { b }), + FusedKerSpec::PerRowMax(m) => per_row!(ab, m, |a, b| if a > b { a } else { b }), + FusedKerSpec::PerRowAdd(m) => per_row!(ab, m, |a, b| a + b), + FusedKerSpec::PerRowMul(m) => per_row!(ab, m, |a, b| a * b), + FusedKerSpec::PerRowSub(m) => per_row!(ab, m, |a, b| a - b), + FusedKerSpec::PerRowSubF(m) => per_row!(ab, m, |a, b| b - a), + FusedKerSpec::PerColMin(m) => per_col!(ab, m, |a, b| if a < b { a } else { b }), + FusedKerSpec::PerColMax(m) => per_col!(ab, m, |a, b| if a > b { a } else { b }), + FusedKerSpec::PerColAdd(m) => per_col!(ab, m, |a, b| a + b), + FusedKerSpec::PerColMul(m) => per_col!(ab, m, |a, b| a * b), + FusedKerSpec::PerColSub(m) => per_col!(ab, m, |a, b| a - b), + FusedKerSpec::PerColSubF(m) => per_col!(ab, m, |a, b| b - a), + FusedKerSpec::AddRowColProducts(rows, cols) => { + for i in 0..MR { + for j in 0..NR { + ab[i][j] += *rows.add(i) * *cols.add(j); + } + } + } + FusedKerSpec::AddUnicast(other) => { + if TI::datum_type().is_float() && other.item_size == 2 { + add_unicast::(&mut ab, &other) + } else if TI::datum_type().is_float() && other.item_size == 4 { + add_unicast::(&mut ab, &other) + } else if TI::datum_type().is_float() && other.item_size == 8 { + add_unicast::(&mut ab, &other) + } else if 
TI::datum_type() == i32::datum_type() && other.item_size == 1 { + add_unicast::(&mut ab, &other) + } else if TI::datum_type() == i32::datum_type() && other.item_size == 4 { + add_unicast::(&mut ab, &other) + } else { + unimplemented!("Missing AddUnicast type"); + } + } + FusedKerSpec::ShiftLeft(shift) => { + for i in 0..MR { + for j in 0..NR { + ab[i][j] = ab[i][j].q_shl(shift); + } + } + } + FusedKerSpec::RoundingShiftRight(shift, rp) => { + for i in 0..MR { + for j in 0..NR { + ab[i][j] = ab[i][j].q_shr(shift, rp); + } + } + } + FusedKerSpec::QScale(shift, rp, mult) => { + for i in 0..MR { + for j in 0..NR { + ab[i][j] = ab[i][j].q_scale(Scaler::from_fuse_params(shift, rp, mult)); + } + } + } + FusedKerSpec::AddMatMul { k, pa, pb, packing } => { + use std::mem::transmute; + if TI::datum_type().is_float() { + match packing { + 0 => add_mat_mul::(pa, pb, k, &mut ab), + 1 => add_mat_mul::(pa, pb, k, &mut ab), + 2 => add_mat_mul::(pa, pb, k, &mut ab), + 3 => add_mat_mul::(pa, pb, k, &mut ab), + 4 => add_mat_mul::(pa, pb, k, &mut ab), + 5 => add_mat_mul_pq40::(pa, pb, k, &mut ab), + 6 => add_mat_mul_pq40_scales_at_end::( + pa, pb, k, &mut ab, + ), + 7 => add_mat_mul_pq40::(pa, pb, k, &mut ab), + _ => unreachable!(), + } + } else if TI::datum_type() == i32::datum_type() { + // transmute to allow using explicitly i3 in add_mat_mul generic params + let ab = transmute::<&mut [[TI; NR]; MR], &mut [[i32; NR]; MR]>(&mut ab); + if packing == 0 { + add_mat_mul::(pa, pb, k, ab) + } else if packing == 1 { + add_mat_mul::(pa, pb, k, ab) + } else { + return 1; + } + } else { + return 1; + } + } + FusedKerSpec::Store(tile) => { + if TI::datum_type().is_float() { + match tile.item_size { + 2 => store_float_t::(&tile, &ab), + 4 => store_float_t::(&tile, &ab), + 8 => store_float_t::(&tile, &ab), + _ => unimplemented!(), + } + } else { + match tile.item_size { + 1 => store_t::(&tile, &ab), + 2 => store_t::(&tile, &ab), + 4 => store_t::(&tile, &ab), + 8 => store_t::(&tile, &ab), + _ => 
unimplemented!(), + } + } + } + }; + pnl = pnl.add(1); + } + } + 0 +} + +fn pq40_r4() -> PackedBlockQuantFormat { + PackedBlockQuantFormat::new(&Q4_0, 4, 0, false) +} + +fn pq40_r4_se() -> PackedBlockQuantFormat { + PackedBlockQuantFormat::new(&Q4_0, 4, 0, true) +} + +// f16 kernels +MMMRustKernel!(kernel:: => generic_f16_4x4(4,4) + packing[1] = f16f16bis => |k| k.with_packing(f16::packing(4), f16::packing(4)); + packing[2] = f32f32 => |k| k.with_packing(f32::packing(4), f32::packing(4)); + packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(4)); + packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(4)); + packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(4)); + packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(4)); + packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(4)); + quality(if has_fp16() { ImplementationQuality::Generic } else { ImplementationQuality::Dreadful }) + store(f32, f64) +); + +MMMRustKernel! 
{kernel:: => generic_f16_4x1(4,1) + packing[1] = f16f16bis => |k| k.with_packing(f16::packing(4), f16::packing(1)); + packing[2] = f32f32 => |k| k.with_packing(f32::packing(4), f32::packing(1)); + packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(1)); + packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(1)); + packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(1)); + packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(1)); + packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(1)); + quality(if has_fp16() { ImplementationQuality::Generic } else { ImplementationQuality::Dreadful }) + store(f32, f64) +} + +// f32 kernels +MMMRustKernel!(kernel:: => generic_f32_4x4(4,4) + packing[1] = f16f16 => |k| k.with_packing(f16::packing(4), f16::packing(4)); + packing[2] = f32f32bis => |k| k.with_packing(f32::packing(4), f32::packing(4)); + packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(4)); + packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(4)); + packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(4)); + packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(4)); + packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(4)); + quality(ImplementationQuality::Generic) + store(f16, f64) +); +MMMRustKernel! 
{kernel:: => generic_f32_4x1(4,1) + packing[1] = f16f16 => |k| k.with_packing(f16::packing(4), f16::packing(1)); + packing[2] = f32f32bis => |k| k.with_packing(f32::packing(4), f32::packing(1)); + packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(1)); + packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(1)); + packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(1)); + packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(1)); + packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(1)); + quality(ImplementationQuality::Generic) + store(f16, f64) +} + +// f64 kernels +MMMRustKernel!(kernel:: => generic_f64_4x4(4,4) + quality(ImplementationQuality::Generic) + store(f16, f32)); +MMMRustKernel!(kernel:: => generic_f64_4x1(4,1) + quality(ImplementationQuality::Generic) + store(f16, f32)); + +// I32 kernels +MMMRustKernel! {kernel:: => generic_i32_4x4(4,4) + packing[1] = i8i8 => |k| k.with_packing(i8::packing(4), i8::packing(4)); + quality(ImplementationQuality::Generic) + store(i8) +} + +MMMRustKernel! {kernel:: => generic_i32_4x1(4,1) + packing[1] = i8i8 => |k| k.with_packing(i8::packing(4), i8::packing(1)); + quality(ImplementationQuality::Generic) + store(i8) +} + +// extra tests kernels +#[cfg(test)] +MMMRustKernel!(kernel:: => generic_f32_3x2(3,2) store(f16, f64)); + +#[cfg(test)] +MMMRustKernel! 
{kernel:: => generic_i32_3x2(3,2) + packing[1] = i8i8 => |k| k.with_packing(i8::packing(3), i8::packing(2)); + store(i8) +} + +pub fn plug(ops: &mut Ops) { + ops.mmm_impls.push(generic_f16_4x4.mmm()); + ops.mmm_impls.push(generic_f16_4x1.mmm()); + ops.mmm_impls.push(generic_f32_4x4.mmm()); + ops.mmm_impls.push(generic_f32_4x1.mmm()); + ops.mmm_impls.push(generic_f64_4x4.mmm()); + ops.mmm_impls.push(generic_f64_4x1.mmm()); + ops.mmm_impls.push(generic_i32_4x4.mmm()); + ops.mmm_impls.push(generic_i32_4x1.mmm()); +} + +#[cfg(test)] +mod test { + + #[test] + fn kits() { + let mut ops = crate::generic(); + super::plug(&mut ops); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/reduce.rs b/vendor/tract-linalg-0.22.1/src/generic/reduce.rs new file mode 100644 index 000000000..af38cfb22 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/reduce.rs @@ -0,0 +1,187 @@ +// Reduce generic implementation +pub mod max { + pub use tract_data::internal::f16; + + reduce_impl_wrap!( + f32, + SMax4, + 4, + 4, + (), + f32::MIN, + fn run(x: &[f32], _: ()) -> f32 { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + *x.iter().max_by(|a, b| a.total_cmp(b)).unwrap() + }, + fn reduce_two(a: f32, b: f32) -> f32 { + a.max(b) + } + ); + + reduce_impl_wrap!( + f16, + HMax8, + 8, + 8, + (), + f16::MIN, + fn run(x: &[f16], _: ()) -> f16 { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + *x.iter().max_by(|a, b| a.total_cmp(b)).unwrap() + }, + fn reduce_two(a: f16, b: f16) -> f16 { + a.max(b) + } + ); + + #[cfg(test)] + #[macro_use] + pub mod s { + crate::max_frame_tests!(true, f32, crate::generic::reduce::max::SMax4); + } + + #[cfg(test)] + #[macro_use] + pub mod h { + use super::*; + crate::max_frame_tests!(true, f16, crate::generic::reduce::max::HMax8); + } +} + +// Reduce generic implementation +pub mod sum { + use crate::num_traits::Zero; + pub use 
tract_data::internal::f16; + + reduce_impl_wrap!( + f32, + SSum4, + 4, + 4, + (), + 0.0, + fn run(x: &[f32], _: ()) -> f32 { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter().sum::() + }, + fn reduce_two(a: f32, b: f32) -> f32 { + a + b + } + ); + + reduce_impl_wrap!( + f16, + HSum8, + 8, + 8, + (), + f16::zero(), + fn run(x: &[f16], _: ()) -> f16 { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter().sum::() + }, + fn reduce_two(a: f16, b: f16) -> f16 { + a + b + } + ); + + #[cfg(test)] + #[macro_use] + pub mod s { + crate::sum_frame_tests!(true, f32, crate::generic::reduce::sum::SSum4); + } + + #[cfg(test)] + #[macro_use] + pub mod h { + use super::*; + crate::sum_frame_tests!(true, f16, crate::generic::reduce::sum::HSum8); + } +} + +// Softmax generic implementation +pub mod softmax_l2 { + use crate::num_traits::Zero; + use tract_data::internal::f16; + + map_reduce_impl_wrap!( + f32, + SSoftMaxL2, + 4, + 4, + f32, + f32::MIN, + 0.0, + fn run(x: &mut [f32], max: f32) -> f32 { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + let mut sum = 0.; + for v in x.iter_mut() { + let y = *v - max; + let y = fast_compact_exp_f32(y); + *v = y; + sum += y; + } + sum + }, + fn reduce_two(a: f32, b: f32) -> f32 { + a + b + } + ); + + map_reduce_impl_wrap!( + f16, + HSoftMaxL2, + 8, + 8, + f16, + f16::MIN, + f16::zero(), + fn run(x: &mut [f16], max: f16) -> f16 { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + let mut sum = f16::zero(); + for v in x.iter_mut() { + let y = *v - max; + let y = f16::from_f32(fast_compact_exp_f32(y.to_f32())); + *v = y; + sum += y; + } + sum + }, + fn reduce_two(a: f16, b: f16) -> f16 { + a + b + } + ); + + // ported from 
https://github.com/gnuradio/volk/blob/master/kernels/volk/volk_32f_expfast_32f.h + // probably inspired from https://nic.schraudolph.org/pubs/Schraudolph99.pdf + // not that the cast to u32 deals with negative right, while implem in volk code are wrong in some + // corner cases (need a max(0,x) before the u32 conversion) + pub fn fast_compact_exp_f32(v: f32) -> f32 { + const MLN2: f32 = 0.6931471805f32; + const A: f32 = 8388608.0f32; + const B: f32 = 1065353216.0f32; + const C: f32 = 60801.0f32; + const SLOPE: f32 = A / MLN2; + const OFFSET: f32 = B - C; + f32::from_bits(((SLOPE * v) + OFFSET) as u32) + } + + #[cfg(test)] + #[macro_use] + pub mod s { + crate::softmax_l2_frame_tests!(true, f32, super::SSoftMaxL2); + } + + #[cfg(test)] + #[macro_use] + pub mod h { + use super::*; + crate::softmax_l2_frame_tests!(true, f16, HSoftMaxL2); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/rounding.rs b/vendor/tract-linalg-0.22.1/src/generic/rounding.rs new file mode 100644 index 000000000..97aed257c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/rounding.rs @@ -0,0 +1,524 @@ +use crate::frame::mmm::*; +use std::hash::{Hash, Hasher}; +use std::ops::Mul; +use tract_data::prelude::f16; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Scaler { + pub scale: f32, + pub mult: Option, + pub shift: isize, + pub policy: RoundingPolicy, +} + +impl Eq for Scaler {} + +#[allow(clippy::derived_hash_with_manual_eq)] +impl Hash for Scaler { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + Hash::hash(&self.scale.to_bits(), state) + } +} + +impl Scaler { + pub fn new(scale: f32, policy: RoundingPolicy) -> Self { + let (mult, shift) = Self::convert_scale_to_mult_shift(scale); + Self { scale, mult, shift, policy } + } + + pub fn as_fused_spec(&self) -> FusedSpec<'_> { + if let Some(multiplier) = self.mult { + FusedSpec::QScale(self.shift, self.policy, multiplier) + } else if self.shift > 0 { + FusedSpec::RoundingShiftRight(self.shift as usize, 
self.policy) + } else { + FusedSpec::ShiftLeft((-self.shift) as usize) + } + } + + // FIXME: Only to avoid fused op breaking + pub fn from_fuse_params(shift: isize, policy: RoundingPolicy, mult: i32) -> Self { + let scale = mult as f32 * 2f32.powi(-(31 + shift as i32)); + Self { scale, mult: Some(mult), shift, policy } + } + + #[inline] + // This function convert a scale (actually a fraction of two integers Q/D) + // into an integer multiplier and a shift (the multiplier being 1/2D in Q0_31). + fn convert_scale_to_mult_shift(scale: f32) -> (Option, isize) { + // Zero is a special case to handle + if scale == 0.0 { + return (None, 0); + } + + // Convert f32 to bits representation with the following pattern + // Bit | 31 | 30-23 | 22-0 | + // | Sign | Exponent | Fraction | + let scale_bits = scale.to_bits(); + + // Get actual value of the exponent + let current_exponent = (scale_bits >> 23) & 0xff; + + // Extract fractional part of the float with: + // - 0x007fffff that represents the mask of the 23 lower bits (fractional part) + // (partial because it doesn't include the hidden bit (24) of the float representation) + let partial_frac = scale_bits & 0x007fffff; + + if partial_frac == 0 { + let shift = 127 - current_exponent as isize; + (None, shift) + } else { + // We add 0x800000 that represents the hidden bit set to one. + // Here the frac is encoded as a Q8_23. + let frac = partial_frac | 0x800000; + + // We rescale the result to be in Q0_31 + // We should have shifted the result by 8 but the frac value is in [1.0, 2.0) + // so we cannot do that (we would need one bit for the integer). + // Instead we devide the frac by two to be in [0.5, 1.0) in Q0_31 + // which lead to a shift of (8-1 = 7). + let half_frac = (frac << 7) as i32; + + // Compute the actual value of the shift + // Here, we remove one as half_frac needs to be multiplied by 2. 
+ let shift = 127 - current_exponent as isize - 1; + (Some(half_frac), shift) + } + } +} + +impl Mul for Scaler { + type Output = f16; + + #[inline] + fn mul(self, rhs: f16) -> Self::Output { + f16::from_f32(self.scale) * rhs + } +} + +impl Mul for Scaler { + type Output = f32; + + #[inline] + fn mul(self, rhs: f32) -> Self::Output { + self.scale * rhs + } +} + +impl Mul for Scaler { + type Output = f64; + + #[inline] + fn mul(self, rhs: f64) -> Self::Output { + self.scale as f64 * rhs + } +} + +impl Mul for f16 { + type Output = f16; + + #[inline] + fn mul(self, rhs: Scaler) -> Self::Output { + rhs * self + } +} + +impl Mul for f32 { + type Output = f32; + + #[inline] + fn mul(self, rhs: Scaler) -> Self::Output { + rhs * self + } +} + +impl Mul for f64 { + type Output = f64; + + #[inline] + fn mul(self, rhs: Scaler) -> Self::Output { + rhs * self + } +} + +impl Mul for Scaler { + type Output = i32; + + #[inline] + fn mul(self, rhs: i32) -> Self::Output { + let (val, shift) = if let Some(multiplier) = self.mult { + (multiplier as i64 * rhs as i64, self.shift + 31) + } else { + (rhs as i64, self.shift) + }; + + // Round according to rounding policy + use RoundingPolicy::*; + if shift > 0 { + let half: i64 = 1 << (shift - 1); + let nudge: i64 = match self.policy { + Zero => -1, + MinusInf => -((val >= 0) as i64), + PlusInf => -((val <= 0) as i64), + Away => 0, + Even => ((val.abs() >> shift) & 0x1) - 1, + Odd => -((val.abs() >> shift) & 0x1), + _ => panic!(), + }; + + (val.signum() * ((val.abs() + half + nudge) >> shift)) as i32 + } else { + (val << -shift) as i32 + } + } +} + +impl Mul for i32 { + type Output = i32; + + #[inline] + fn mul(self, rhs: Scaler) -> Self::Output { + rhs * self + } +} + +pub trait ScaleShiftAndRound { + fn q_scale(self, scaler: Scaler) -> Self; + fn q_shl(self, shift: usize) -> Self; + fn q_shr(self, shift: usize, rp: RoundingPolicy) -> Self; +} + +impl ScaleShiftAndRound for f64 { + fn q_scale(self, scaler: Scaler) -> Self { + self * 
scaler + } + fn q_shl(self, shift: usize) -> Self { + self * 2f64.powi(shift as i32) + } + fn q_shr(self, shift: usize, _rp: RoundingPolicy) -> Self { + self * 2f64.powi(-(shift as i32)) + } +} + +impl ScaleShiftAndRound for f32 { + fn q_scale(self, scaler: Scaler) -> Self { + self * scaler + } + fn q_shl(self, shift: usize) -> Self { + self * 2f32.powi(shift as i32) + } + fn q_shr(self, shift: usize, _rp: RoundingPolicy) -> Self { + self * 2f32.powi(-(shift as i32)) + } +} + +impl ScaleShiftAndRound for f16 { + fn q_scale(self, scaler: Scaler) -> Self { + self * scaler + } + fn q_shl(self, shift: usize) -> Self { + self * f16::from_f32(2f32.powi(shift as i32)) + } + fn q_shr(self, shift: usize, _rp: RoundingPolicy) -> Self { + self * f16::from_f32(2f32.powi(-(shift as i32))) + } +} + +impl ScaleShiftAndRound for i32 { + fn q_scale(self, scaler: Scaler) -> Self { + self * scaler + } + fn q_shr(self, shift: usize, rp: RoundingPolicy) -> Self { + use RoundingPolicy::*; + let half: i32 = 1 << (shift - 1); + let nudge: i32 = match rp { + Zero => -1, + MinusInf => -((self >= 0) as i32), + PlusInf => -((self <= 0) as i32), + Away => 0, + Even => ((self.abs() >> shift) & 0x1) - 1, + Odd => -((self.abs() >> shift) & 0x1), + _ => panic!(), + }; + self.signum() * ((self.abs() + half + nudge) >> shift) + } + fn q_shl(self, shift: usize) -> Self { + self << shift + } +} + +// 6 / 4 -> 1.5 -> arrondi: 2. rien a faire +// 2 / 4 -> 0.5 -> arrondi: 1. 
veut 0 -> nudge = -1 + +#[cfg(test)] +mod test { + use super::RoundingPolicy::*; + use super::*; + + #[test] + fn test_scale_rounding_f32() { + assert_eq!(0f32.q_scale(Scaler::new(0.5, Zero)), 0.0); + assert_eq!(1f32.q_scale(Scaler::new(0.5, Zero)), 0.5); + assert_eq!(2f32.q_scale(Scaler::new(0.5, Zero)), 1.0); + assert_eq!(3f32.q_scale(Scaler::new(0.5, Zero)), 1.5); + assert_eq!((-1f32).q_scale(Scaler::new(0.5, Zero)), -0.5); + assert_eq!((-2f32).q_scale(Scaler::new(0.5, Zero)), -1.0); + assert_eq!((-3f32).q_scale(Scaler::new(0.5, Zero)), -1.5); + } + + #[test] + fn test_shift_rounding_zero() { + assert_eq!(0i32.q_shr(1, Zero), 0); + assert_eq!(1i32.q_shr(1, Zero), 0); + assert_eq!(2i32.q_shr(1, Zero), 1); + assert_eq!(3i32.q_shr(1, Zero), 1); + assert_eq!(0i32.q_shr(2, Zero), 0); + assert_eq!(1i32.q_shr(2, Zero), 0); + assert_eq!(2i32.q_shr(2, Zero), 0); + assert_eq!(3i32.q_shr(2, Zero), 1); + assert_eq!(4i32.q_shr(2, Zero), 1); + assert_eq!(5i32.q_shr(2, Zero), 1); + assert_eq!(6i32.q_shr(2, Zero), 1); + assert_eq!((-1i32).q_shr(2, Zero), 0); + assert_eq!((-2i32).q_shr(2, Zero), 0); + assert_eq!((-3i32).q_shr(2, Zero), -1); + assert_eq!((-4i32).q_shr(2, Zero), -1); + assert_eq!((-5i32).q_shr(2, Zero), -1); + assert_eq!((-6i32).q_shr(2, Zero), -1); + } + + #[test] + fn test_scale_rounding_zero() { + assert_eq!(0i32.q_scale(Scaler::new(0.5, Zero)), 0); + assert_eq!(1i32.q_scale(Scaler::new(0.5, Zero)), 0); + assert_eq!(2i32.q_scale(Scaler::new(0.5, Zero)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.5, Zero)), 1); + assert_eq!((-1i32).q_scale(Scaler::new(0.5, Zero)), 0); + assert_eq!((-2i32).q_scale(Scaler::new(0.5, Zero)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.5, Zero)), -1); + assert_eq!(2i32.q_scale(Scaler::new(0.25, Zero)), 0); + assert_eq!(3i32.q_scale(Scaler::new(0.25, Zero)), 1); + assert_eq!(4i32.q_scale(Scaler::new(0.25, Zero)), 1); + assert_eq!(5i32.q_scale(Scaler::new(0.25, Zero)), 1); + assert_eq!(6i32.q_scale(Scaler::new(0.25, Zero)), 1); + 
assert_eq!((-2i32).q_scale(Scaler::new(0.25, Zero)), 0); + assert_eq!((-3i32).q_scale(Scaler::new(0.25, Zero)), -1); + assert_eq!((-4i32).q_scale(Scaler::new(0.25, Zero)), -1); + assert_eq!((-5i32).q_scale(Scaler::new(0.25, Zero)), -1); + assert_eq!((-6i32).q_scale(Scaler::new(0.25, Zero)), -1); + } + + #[test] + fn test_shift_rounding_away() { + assert_eq!(0i32.q_shr(1, Away), 0); + assert_eq!(1i32.q_shr(1, Away), 1); + assert_eq!(2i32.q_shr(1, Away), 1); + assert_eq!(3i32.q_shr(1, Away), 2); + assert_eq!(0i32.q_shr(2, Away), 0); + assert_eq!(1i32.q_shr(2, Away), 0); + assert_eq!(2i32.q_shr(2, Away), 1); + assert_eq!(3i32.q_shr(2, Away), 1); + assert_eq!(4i32.q_shr(2, Away), 1); + assert_eq!(5i32.q_shr(2, Away), 1); + assert_eq!(6i32.q_shr(2, Away), 2); + assert_eq!((-1i32).q_shr(2, Away), 0); + assert_eq!((-2i32).q_shr(2, Away), -1); + assert_eq!((-3i32).q_shr(2, Away), -1); + assert_eq!((-4i32).q_shr(2, Away), -1); + assert_eq!((-5i32).q_shr(2, Away), -1); + assert_eq!((-6i32).q_shr(2, Away), -2); + } + + #[test] + fn test_scale_rounding_away() { + assert_eq!(0i32.q_scale(Scaler::new(0.5, Away)), 0); + assert_eq!(1i32.q_scale(Scaler::new(0.5, Away)), 1); + assert_eq!(2i32.q_scale(Scaler::new(0.5, Away)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.5, Away)), 2); + assert_eq!((-1i32).q_scale(Scaler::new(0.5, Away)), -1); + assert_eq!((-2i32).q_scale(Scaler::new(0.5, Away)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.5, Away)), -2); + assert_eq!(2i32.q_scale(Scaler::new(0.25, Away)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.25, Away)), 1); + assert_eq!(4i32.q_scale(Scaler::new(0.25, Away)), 1); + assert_eq!(5i32.q_scale(Scaler::new(0.25, Away)), 1); + assert_eq!(6i32.q_scale(Scaler::new(0.25, Away)), 2); + assert_eq!((-2i32).q_scale(Scaler::new(0.25, Away)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.25, Away)), -1); + assert_eq!((-4i32).q_scale(Scaler::new(0.25, Away)), -1); + assert_eq!((-5i32).q_scale(Scaler::new(0.25, Away)), -1); + 
assert_eq!((-6i32).q_scale(Scaler::new(0.25, Away)), -2); + } + + #[test] + fn test_shift_rounding_plus_inf() { + assert_eq!(0i32.q_shr(1, PlusInf), 0); + assert_eq!(1i32.q_shr(1, PlusInf), 1); + assert_eq!(2i32.q_shr(1, PlusInf), 1); + assert_eq!(3i32.q_shr(1, PlusInf), 2); + assert_eq!(0i32.q_shr(2, PlusInf), 0); + assert_eq!(1i32.q_shr(2, PlusInf), 0); + assert_eq!(2i32.q_shr(2, PlusInf), 1); + assert_eq!(3i32.q_shr(2, PlusInf), 1); + assert_eq!(4i32.q_shr(2, PlusInf), 1); + assert_eq!(5i32.q_shr(2, PlusInf), 1); + assert_eq!(6i32.q_shr(2, PlusInf), 2); + assert_eq!((-1i32).q_shr(2, PlusInf), 0); + assert_eq!((-2i32).q_shr(2, PlusInf), 0); + assert_eq!((-3i32).q_shr(2, PlusInf), -1); + assert_eq!((-4i32).q_shr(2, PlusInf), -1); + assert_eq!((-5i32).q_shr(2, PlusInf), -1); + assert_eq!((-6i32).q_shr(2, PlusInf), -1); + } + + #[test] + fn test_scale_rounding_plus_inf() { + assert_eq!(0i32.q_scale(Scaler::new(0.5, PlusInf)), 0); + assert_eq!(1i32.q_scale(Scaler::new(0.5, PlusInf)), 1); + assert_eq!(2i32.q_scale(Scaler::new(0.5, PlusInf)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.5, PlusInf)), 2); + assert_eq!((-1i32).q_scale(Scaler::new(0.5, PlusInf)), 0); + assert_eq!((-2i32).q_scale(Scaler::new(0.5, PlusInf)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.5, PlusInf)), -1); + assert_eq!(2i32.q_scale(Scaler::new(0.25, PlusInf)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.25, PlusInf)), 1); + assert_eq!(4i32.q_scale(Scaler::new(0.25, PlusInf)), 1); + assert_eq!(5i32.q_scale(Scaler::new(0.25, PlusInf)), 1); + assert_eq!(6i32.q_scale(Scaler::new(0.25, PlusInf)), 2); + assert_eq!((-2i32).q_scale(Scaler::new(0.25, PlusInf)), 0); + assert_eq!((-3i32).q_scale(Scaler::new(0.25, PlusInf)), -1); + assert_eq!((-4i32).q_scale(Scaler::new(0.25, PlusInf)), -1); + assert_eq!((-5i32).q_scale(Scaler::new(0.25, PlusInf)), -1); + assert_eq!((-6i32).q_scale(Scaler::new(0.25, PlusInf)), -1); + } + + #[test] + fn test_shift_rounding_minus_inf() { + assert_eq!(0i32.q_shr(1, 
MinusInf), 0); + assert_eq!(1i32.q_shr(1, MinusInf), 0); + assert_eq!(2i32.q_shr(1, MinusInf), 1); + assert_eq!(3i32.q_shr(1, MinusInf), 1); + assert_eq!(0i32.q_shr(2, MinusInf), 0); + assert_eq!(1i32.q_shr(2, MinusInf), 0); + assert_eq!(2i32.q_shr(2, MinusInf), 0); + assert_eq!(3i32.q_shr(2, MinusInf), 1); + assert_eq!(4i32.q_shr(2, MinusInf), 1); + assert_eq!(5i32.q_shr(2, MinusInf), 1); + assert_eq!(6i32.q_shr(2, MinusInf), 1); + assert_eq!((-1i32).q_shr(2, MinusInf), 0); + assert_eq!((-2i32).q_shr(2, MinusInf), -1); + assert_eq!((-3i32).q_shr(2, MinusInf), -1); + assert_eq!((-4i32).q_shr(2, MinusInf), -1); + assert_eq!((-5i32).q_shr(2, MinusInf), -1); + assert_eq!((-6i32).q_shr(2, MinusInf), -2); + } + + #[test] + fn test_scale_rounding_minus_inf() { + assert_eq!(0i32.q_scale(Scaler::new(0.5, MinusInf)), 0); + assert_eq!(1i32.q_scale(Scaler::new(0.5, MinusInf)), 0); + assert_eq!(2i32.q_scale(Scaler::new(0.5, MinusInf)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.5, MinusInf)), 1); + assert_eq!((-1i32).q_scale(Scaler::new(0.5, MinusInf)), -1); + assert_eq!((-2i32).q_scale(Scaler::new(0.5, MinusInf)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.5, MinusInf)), -2); + assert_eq!(2i32.q_scale(Scaler::new(0.25, MinusInf)), 0); + assert_eq!(3i32.q_scale(Scaler::new(0.25, MinusInf)), 1); + assert_eq!(4i32.q_scale(Scaler::new(0.25, MinusInf)), 1); + assert_eq!(5i32.q_scale(Scaler::new(0.25, MinusInf)), 1); + assert_eq!(6i32.q_scale(Scaler::new(0.25, MinusInf)), 1); + assert_eq!((-2i32).q_scale(Scaler::new(0.25, MinusInf)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.25, MinusInf)), -1); + assert_eq!((-4i32).q_scale(Scaler::new(0.25, MinusInf)), -1); + assert_eq!((-5i32).q_scale(Scaler::new(0.25, MinusInf)), -1); + assert_eq!((-6i32).q_scale(Scaler::new(0.25, MinusInf)), -2); + //assert_eq!((-9i32).q_scale(ONE_OVER_TWO_IN_Q0_30, 5, MinusInf), 0); + } + + #[test] + fn test_shift_rounding_even() { + assert_eq!(0i32.q_shr(1, Even), 0); + assert_eq!(1i32.q_shr(1, 
Even), 0); + assert_eq!(2i32.q_shr(1, Even), 1); + assert_eq!(3i32.q_shr(1, Even), 2); + assert_eq!(0i32.q_shr(2, Even), 0); + assert_eq!(1i32.q_shr(2, Even), 0); + assert_eq!(2i32.q_shr(2, Even), 0); + assert_eq!(3i32.q_shr(2, Even), 1); + assert_eq!(4i32.q_shr(2, Even), 1); + assert_eq!(5i32.q_shr(2, Even), 1); + assert_eq!(6i32.q_shr(2, Even), 2); + assert_eq!((-1i32).q_shr(2, Even), 0); + assert_eq!((-2i32).q_shr(2, Even), 0); + assert_eq!((-3i32).q_shr(2, Even), -1); + assert_eq!((-4i32).q_shr(2, Even), -1); + assert_eq!((-5i32).q_shr(2, Even), -1); + assert_eq!((-6i32).q_shr(2, Even), -2); + } + + #[test] + fn test_scale_rounding_even() { + assert_eq!(0i32.q_scale(Scaler::new(0.5, Even)), 0); + assert_eq!(1i32.q_scale(Scaler::new(0.5, Even)), 0); + assert_eq!(2i32.q_scale(Scaler::new(0.5, Even)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.5, Even)), 2); + assert_eq!((-1i32).q_scale(Scaler::new(0.5, Even)), 0); + assert_eq!((-2i32).q_scale(Scaler::new(0.5, Even)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.5, Even)), -2); + assert_eq!(2i32.q_scale(Scaler::new(0.25, Even)), 0); + assert_eq!(3i32.q_scale(Scaler::new(0.25, Even)), 1); + assert_eq!(4i32.q_scale(Scaler::new(0.25, Even)), 1); + assert_eq!(5i32.q_scale(Scaler::new(0.25, Even)), 1); + assert_eq!(6i32.q_scale(Scaler::new(0.25, Even)), 2); + assert_eq!((-2i32).q_scale(Scaler::new(0.25, Even)), 0); + assert_eq!((-3i32).q_scale(Scaler::new(0.25, Even)), -1); + assert_eq!((-4i32).q_scale(Scaler::new(0.25, Even)), -1); + assert_eq!((-5i32).q_scale(Scaler::new(0.25, Even)), -1); + assert_eq!((-6i32).q_scale(Scaler::new(0.25, Even)), -2); + } + + #[test] + fn test_shift_rounding_odd() { + assert_eq!(0i32.q_shr(1, Odd), 0); + assert_eq!(1i32.q_shr(1, Odd), 1); + assert_eq!(2i32.q_shr(1, Odd), 1); + assert_eq!(3i32.q_shr(1, Odd), 1); + assert_eq!(0i32.q_shr(2, Odd), 0); + assert_eq!(1i32.q_shr(2, Odd), 0); + assert_eq!(2i32.q_shr(2, Odd), 1); + assert_eq!(3i32.q_shr(2, Odd), 1); + assert_eq!(4i32.q_shr(2, 
Odd), 1); + assert_eq!(5i32.q_shr(2, Odd), 1); + assert_eq!(6i32.q_shr(2, Odd), 1); + assert_eq!((-1i32).q_shr(2, Odd), 0); + assert_eq!((-2i32).q_shr(2, Odd), -1); + assert_eq!((-3i32).q_shr(2, Odd), -1); + assert_eq!((-4i32).q_shr(2, Odd), -1); + assert_eq!((-5i32).q_shr(2, Odd), -1); + assert_eq!((-6i32).q_shr(2, Odd), -1); + } + + #[test] + fn test_scale_rounding_odd() { + assert_eq!(0i32.q_scale(Scaler::new(0.5, Odd)), 0); + assert_eq!(1i32.q_scale(Scaler::new(0.5, Odd)), 1); + assert_eq!(2i32.q_scale(Scaler::new(0.5, Odd)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.5, Odd)), 1); + assert_eq!((-1i32).q_scale(Scaler::new(0.5, Odd)), -1); + assert_eq!((-2i32).q_scale(Scaler::new(0.5, Odd)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.5, Odd)), -1); + assert_eq!(2i32.q_scale(Scaler::new(0.25, Odd)), 1); + assert_eq!(3i32.q_scale(Scaler::new(0.25, Odd)), 1); + assert_eq!(4i32.q_scale(Scaler::new(0.25, Odd)), 1); + assert_eq!(5i32.q_scale(Scaler::new(0.25, Odd)), 1); + assert_eq!(6i32.q_scale(Scaler::new(0.25, Odd)), 1); + assert_eq!((-2i32).q_scale(Scaler::new(0.25, Odd)), -1); + assert_eq!((-3i32).q_scale(Scaler::new(0.25, Odd)), -1); + assert_eq!((-4i32).q_scale(Scaler::new(0.25, Odd)), -1); + assert_eq!((-5i32).q_scale(Scaler::new(0.25, Odd)), -1); + assert_eq!((-6i32).q_scale(Scaler::new(0.25, Odd)), -1); + } +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs b/vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs new file mode 100644 index 000000000..c344757d4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs @@ -0,0 +1,138 @@ +#![allow(clippy::excessive_precision)] +use crate::frame::element_wise::ElementWiseKer; +use tract_data::internal::*; + +pub fn ssigmoid(x: f32) -> f32 { + const LOW: f32 = -18.6; + const HIGH: f32 = -LOW; + + const ALPHA_13: f32 = -4.433153405e-18; + const ALPHA_11: f32 = 1.169974371e-14; + const ALPHA_9: f32 = -1.875289645e-11; + const ALPHA_7: f32 = 4.257889523e-8; + const ALPHA_5: f32 = 
0.00004811817576; + const ALPHA_3: f32 = 0.008163842030; + const ALPHA_1: f32 = 0.2499999971; + const BETA_6: f32 = 3.922935744e-6; + const BETA_4: f32 = 0.001524872358; + const BETA_2: f32 = 0.1159886749; + const BETA_0: f32 = 1.0; + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_13; + let p = x2 * p + ALPHA_11; + let p = x2 * p + ALPHA_9; + let p = x2 * p + ALPHA_7; + let p = x2 * p + ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_6; + let q = x2 * q + BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + + p / q + 0.5 +} + +pub fn hsigmoid(x: f16) -> f16 { + /* + * (x (0.249895 + x^2 (0.00400222 - 0.0000124702 x^2))) + * / + * (1. + 0.098734 x^2) + */ + + const LOW: f16 = f16::from_f32_const(-6.92); + const HIGH: f16 = f16::from_f32_const(6.92); + + const ALPHA_5: f16 = f16::from_f32_const(-0.0000124702); + const ALPHA_3: f16 = f16::from_f32_const(0.00400222); + const ALPHA_1: f16 = f16::from_f32_const(0.249895); + + const BETA_2: f16 = f16::from_f32_const(0.098734); + const BETA_0: f16 = f16::from_f32_const(1.0); + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_2; + let q = x2 * q + BETA_0; + + p / q + f16::from_f32_const(0.5) +} + +#[derive(Clone, Debug)] +pub struct SSigmoid4; + +impl ElementWiseKer for SSigmoid4 { + fn name() -> &'static str { + "generic" + } + + fn alignment_bytes() -> usize { + 16 + } + + fn alignment_items() -> usize { + 4 + } + + fn nr() -> usize { + 4 + } + + fn run(x: &mut [f32], _: ()) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = ssigmoid(*px)) + } +} + +#[derive(Clone, Debug)] +pub struct HSigmoid8; + +impl ElementWiseKer for HSigmoid8 { + fn name() -> &'static str { + "generic" + } + + fn alignment_bytes() -> usize { + 16 + } + + fn 
alignment_items() -> usize { + 4 + } + + fn nr() -> usize { + 8 + } + + fn run(x: &mut [f16], _: ()) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = hsigmoid(*px)) + } +} + +#[cfg(test)] +#[macro_use] +pub mod s { + sigmoid_frame_tests!(true, f32, crate::generic::sigmoid::SSigmoid4); +} + +#[cfg(test)] +#[macro_use] +pub mod h { + sigmoid_frame_tests!(true, tract_data::internal::f16, crate::generic::sigmoid::HSigmoid8); +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/tanh.rs b/vendor/tract-linalg-0.22.1/src/generic/tanh.rs new file mode 100644 index 000000000..2c7542dd2 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/tanh.rs @@ -0,0 +1,133 @@ +#![allow(clippy::excessive_precision)] +use crate::frame::element_wise::ElementWiseKer; +use tract_data::internal::*; + +pub fn stanh(x: f32) -> f32 { + const LOW: f32 = -8.9; + const HIGH: f32 = 8.9; + + const ALPHA_13: f32 = -8.488492677e-14; + const ALPHA_11: f32 = 5.277853000e-11; + const ALPHA_9: f32 = -2.022500419e-8; + const ALPHA_7: f32 = 0.00001115424833; + const ALPHA_5: f32 = 0.003103950131; + const ALPHA_3: f32 = 0.1308400453; + const ALPHA_1: f32 = 0.9999999934; + + const BETA_6: f32 = 0.0002546136580; + const BETA_4: f32 = 0.02449515379; + const BETA_2: f32 = 0.4641733162; + const BETA_0: f32 = 1.0; + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_13; + let p = x2 * p + ALPHA_11; + let p = x2 * p + ALPHA_9; + let p = x2 * p + ALPHA_7; + let p = x2 * p + ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_6; + let q = x2 * q + BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + + p / q +} + +pub fn htanh(x: f16) -> f16 { + const LOW: f16 = f16::from_f32_const(-3.84); + const HIGH: f16 = f16::from_f32_const(3.84); + + const ALPHA_3: f16 = f16::from_f32_const(0.082654955); + const ALPHA_1: f16 = 
f16::from_f32_const(0.99963124); + + const BETA_4: f16 = f16::from_f32_const(0.0065383179); + const BETA_2: f16 = f16::from_f32_const(0.41401828); + const BETA_0: f16 = f16::from_f32_const(1.0); + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + + p / q +} + +#[derive(Clone, Debug)] +pub struct STanh4; + +impl ElementWiseKer for STanh4 { + fn name() -> &'static str { + "generic" + } + + fn alignment_items() -> usize { + 16 + } + + fn alignment_bytes() -> usize { + 16 + } + + fn nr() -> usize { + 4 + } + + fn run(x: &mut [f32], _: ()) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = stanh(*px)) + } +} + +#[cfg(test)] +#[macro_use] +pub mod s { + tanh_frame_tests!(true, f32, crate::generic::tanh::STanh4); +} + +#[derive(Clone, Debug)] +pub struct HTanh8; + +impl ElementWiseKer for HTanh8 { + fn name() -> &'static str { + "generic" + } + + fn alignment_items() -> usize { + 16 + } + + fn alignment_bytes() -> usize { + 16 + } + + fn nr() -> usize { + 8 + } + + fn run(x: &mut [f16], _: ()) { + debug_assert!(x.len() % Self::nr() == 0); + debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0); + x.iter_mut().for_each(|px| *px = htanh(*px)) + } +} + +#[cfg(test)] +#[macro_use] +pub mod h { + tanh_frame_tests!(true, tract_data::internal::f16, crate::generic::tanh::HTanh8); +} diff --git a/vendor/tract-linalg-0.22.1/src/generic/unicast.rs b/vendor/tract-linalg-0.22.1/src/generic/unicast.rs new file mode 100644 index 000000000..2d7d4875b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/generic/unicast.rs @@ -0,0 +1,194 @@ +pub use tract_data::internal::f16; +unicast_impl_wrap!( + f32, + SUnicastMul4, + 4, + 4, + fn run(a: &mut [f32], b: &[f32]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 
0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a *= b) + } +); + +unicast_impl_wrap!( + f16, + HUnicastMul8, + 8, + 8, + fn run(a: &mut [f16], b: &[f16]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a *= b) + } +); + +unicast_impl_wrap!( + f32, + SUnicastAdd4, + 4, + 4, + fn run(a: &mut [f32], b: &[f32]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a += b) + } +); + +unicast_impl_wrap!( + f16, + HUnicastAdd8, + 8, + 8, + fn run(a: &mut [f16], b: &[f16]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a += b) + } +); + +unicast_impl_wrap!( + f32, + SUnicastSub4, + 4, + 4, + fn run(a: &mut [f32], b: &[f32]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a -= b) + } +); + +unicast_impl_wrap!( + f16, + HUnicastSub8, + 8, + 8, + fn run(a: &mut [f16], b: &[f16]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % 
Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a -= b) + } +); + +unicast_impl_wrap!( + f32, + SUnicastSubF4, + 4, + 4, + fn run(a: &mut [f32], b: &[f32]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = *b - *a) + } +); + +unicast_impl_wrap!( + f16, + HUnicastSubF8, + 8, + 8, + fn run(a: &mut [f16], b: &[f16]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = *b - *a) + } +); + +unicast_impl_wrap!( + f32, + SUnicastMin4, + 4, + 4, + fn run(a: &mut [f32], b: &[f32]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.min(*b)) + } +); + +unicast_impl_wrap!( + f16, + HUnicastMin8, + 8, + 8, + fn run(a: &mut [f16], b: &[f16]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.min(*b)) + } +); + +unicast_impl_wrap!( + f32, + SUnicastMax4, + 4, + 4, + fn run(a: &mut [f32], b: &[f32]) { + debug_assert!(a.len() == b.len()); + debug_assert!(a.len() % Self::nr() == 0); + debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0); + debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0); + a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.max(*b)) 
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastMax8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.max(*b))
+    }
+);
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    use super::*;
+    use proptest::strategy::Strategy;
+    crate::unicast_frame_tests!(true, f32, SUnicastMul4, |a, b| a * b);
+    crate::unicast_frame_tests!(true, f32, SUnicastAdd4, |a, b| a + b);
+    crate::unicast_frame_tests!(true, f32, SUnicastSub4, |a, b| a - b);
+    crate::unicast_frame_tests!(true, f32, SUnicastSubF4, |a, b| b - a);
+    crate::unicast_frame_tests!(true, f32, SUnicastMin4, |a, b| a.min(b));
+    crate::unicast_frame_tests!(true, f32, SUnicastMax4, |a, b| a.max(b));
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    use super::*;
+    use proptest::strategy::Strategy;
+    crate::unicast_frame_tests!(true, f16, HUnicastMul8, |a, b| a * b);
+    crate::unicast_frame_tests!(true, f16, HUnicastAdd8, |a, b| a + b);
+    crate::unicast_frame_tests!(true, f16, HUnicastSub8, |a, b| a - b);
+    crate::unicast_frame_tests!(true, f16, HUnicastSubF8, |a, b| b - a);
+    crate::unicast_frame_tests!(true, f16, HUnicastMin8, |a, b| a.min(b));
+    crate::unicast_frame_tests!(true, f16, HUnicastMax8, |a, b| a.max(b));
+}
diff --git a/vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs b/vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs
new file mode 100644
index 000000000..74c6a0e50
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs
@@ -0,0 +1,168 @@
+use tract_data::itertools::Itertools;
+use tract_data::prelude::Blob;
+
+use super::runner;
+
+#[cfg(target_arch = "x86_64")]
+static mut HAS_AVX512: bool = false;
+
+/// Reads `slice` front to back `loops` times using wide aligned SIMD loads.
+///
+/// Each step loads 512 bytes with AVX-512 when `HAS_AVX512` is set, 128 bytes with AVX
+/// otherwise. Steps are never partial and `vmovaps` faults on unaligned addresses, so `slice`
+/// is expected to be vector-aligned and its length a multiple of the step.
+#[cfg(target_arch = "x86_64")]
+#[inline(never)]
+fn load_a_slice(slice: &[u8], loops: usize) {
+    unsafe {
+        if HAS_AVX512 {
+            for _ in 0..loops {
+                let mut ptr = slice.as_ptr();
+                let end = ptr.add(slice.len());
+                while ptr < end {
+                    std::arch::asm!("
+                        vmovaps zmm0, [rsi]
+                        vmovaps zmm1, [rsi + 64]
+                        vmovaps zmm2, [rsi + 128]
+                        vmovaps zmm3, [rsi + 192]
+                        vmovaps zmm4, [rsi + 256]
+                        vmovaps zmm5, [rsi + 320]
+                        vmovaps zmm6, [rsi + 384]
+                        vmovaps zmm7, [rsi + 448]
+                        ", inout("rsi") ptr,
+                        // all eight destination registers are overwritten: declare all of them
+                        out("zmm0") _, out("zmm1") _, out("zmm2") _, out("zmm3") _,
+                        out("zmm4") _, out("zmm5") _, out("zmm6") _, out("zmm7") _,
+                    );
+                    ptr = ptr.add(512);
+                }
+            }
+        } else {
+            for _ in 0..loops {
+                // rewind at each pass: otherwise only the first of the `loops` passes reads
+                let mut ptr = slice.as_ptr();
+                let end = ptr.add(slice.len());
+                while ptr < end {
+                    std::arch::asm!("
+                        vmovaps ymm0, [rsi]
+                        vmovaps ymm1, [rsi + 32]
+                        vmovaps ymm2, [rsi + 64]
+                        vmovaps ymm3, [rsi + 96]
+                        ", inout("rsi") ptr,
+                        out("ymm0") _,
+                        out("ymm1") _,
+                        out("ymm2") _,
+                        out("ymm3") _,
+                    );
+                    ptr = ptr.add(128);
+                }
+            }
+        }
+    }
+}
+
+/// Reads `slice` front to back `loops` times, 128 bytes per step, using post-indexed `ld1`.
+#[cfg(target_arch = "aarch64")]
+#[inline]
+fn load_a_slice(slice: &[u8], loops: usize) {
+    unsafe {
+        for _ in 0..loops {
+            let mut ptr = slice.as_ptr();
+            let end = ptr.add(slice.len());
+            while ptr < end {
+                std::arch::asm!("
+                    ld1 {{v0.16b-v3.16b}}, [x0], #64
+                    ld1 {{v4.16b-v7.16b}}, [x0], #64
+                    ", inout("x0") ptr,
+                    out("v0") _,
+                    out("v1") _,
+                    out("v2") _,
+                    out("v3") _,
+                    out("v4") _,
+                    out("v5") _,
+                    out("v6") _,
+                    out("v7") _,
+                );
+            }
+        }
+    }
+}
+
+/// Reads `slice` front to back `loops` times, 128 bytes per step, using `vldmia` with writeback.
+#[cfg(target_arch = "arm")]
+#[inline(never)]
+fn load_a_slice(slice: &[u8], loops: usize) {
+    unsafe {
+        for _ in 0..loops {
+            let mut ptr = slice.as_ptr();
+            let end = ptr.add(slice.len());
+            while ptr < end {
+                std::arch::asm!("
+                    vldmia r1!, {{q0-q3}}
+                    vldmia r1!, {{q4-q7}}
+                    ", inout("r1") ptr,
+                    out("d0") _, out("d1") _, out("d2") _, out("d3") _,
+                    out("d4") _, out("d5") _, out("d6") _, out("d7") _,
+                    out("d8") _, out("d9") _, out("d10") _, out("d11") _,
+                    out("d12") _, out("d13") _, out("d14") _, out("d15") _,
+                );
+            }
+        }
+    }
+}
+
+fn bandwidth_seq(slice_len: usize, threads: usize) -> f64 {
+    #[cfg(target_arch = "x86_64")]
+    unsafe {
+        HAS_AVX512 = std::is_x86_feature_detected!("avx512f");
+    }
+
std::thread::scope(|s| { + let gards = (0..threads) + .map(|_| { + s.spawn(|| { + let buffer = unsafe { Blob::new_for_size_and_align(slice_len, 1024) }; + runner::run_bench(|loops| load_a_slice(&buffer, loops)) + }) + }) + .collect_vec(); + let time = gards.into_iter().map(|t| t.join().unwrap()).sum::() / threads as f64; + (slice_len * threads) as f64 / time + }) +} + +pub fn what_is_big() -> usize { + 1024 * 1024 * if cfg!(target_arch = "arm") { 64 } else { 256 } +} + +pub fn l1_bandwidth_seq(threads: usize) -> f64 { + // [1024, 2048, 4096, 8192, 16384, 32768, 65536] + [1024] + .into_iter() + .map(|slice_len| bandwidth_seq(slice_len, threads)) + .max_by_key(|x| *x as i64) + .unwrap() +} + +pub fn main_memory_bandwith_seq(threads: usize) -> f64 { + bandwidth_seq(what_is_big(), threads) +} + +#[ignore] +#[test] +fn b() { + let max = what_is_big(); + for threads in [1, 2, 3, 4] { + println!("Threads: {}", threads); + for size in (0..) + .flat_map(|po2| (0..2).map(move |f| (1024 + 512 * f) * (1 << po2))) + .take_while(|&s| s < max) + { + let bw = bandwidth_seq(size, threads); + println!( + "threads: {threads} slice: {} KiB bandwidth: {} GiB/s", + size as f64 / 1024., + (bw / (1024. * 1024. * 1024.)) as usize + ); + } + } +} diff --git a/vendor/tract-linalg-0.22.1/src/hwbench/mod.rs b/vendor/tract-linalg-0.22.1/src/hwbench/mod.rs new file mode 100644 index 000000000..373235856 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/hwbench/mod.rs @@ -0,0 +1,4 @@ +pub mod runner; + +#[cfg(feature = "hwbench")] +pub mod bandwidth; diff --git a/vendor/tract-linalg-0.22.1/src/hwbench/runner.rs b/vendor/tract-linalg-0.22.1/src/hwbench/runner.rs new file mode 100644 index 000000000..97b62c04f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/hwbench/runner.rs @@ -0,0 +1,122 @@ +#![allow(unused_macros)] + +use std::time::Duration; +use std::time::Instant; + +#[macro_export] +macro_rules! r1 { ($($stat:stmt)*) => { $( $stat )* } } +#[macro_export] +macro_rules! 
r2 { ($($stat:stmt)*) => { $( $stat )* $( $stat )* } } +#[macro_export] +macro_rules! r4 { ($($stat:stmt)*) => { r2!(r2!($($stat)*)) }} +#[macro_export] +macro_rules! r8 { ($($stat:stmt)*) => { r2!(r4!($($stat)*)) }} +#[macro_export] +macro_rules! r16 { ($($stat:stmt)*) => { r2!(r8!($($stat)*)) }} +#[macro_export] +macro_rules! r32 { ($($stat:stmt)*) => { r2!(r16!($($stat)*)) }} +#[macro_export] +macro_rules! r64 { ($($stat:stmt)*) => { r2!(r32!($($stat)*)) }} +#[macro_export] +macro_rules! r128 { ($($stat:stmt)*) => { r2!(r64!($($stat)*)) }} +#[macro_export] +macro_rules! r256 { ($($stat:stmt)*) => { r2!(r128!($($stat)*)) }} +#[macro_export] +macro_rules! r512 { ($($stat:stmt)*) => { r2!(r256!($($stat)*)) }} +#[macro_export] +macro_rules! r1024 { ($($stat:stmt)*) => { r2!(r512!($($stat)*)) }} +#[macro_export] +macro_rules! r2048 { ($($stat:stmt)*) => { r2!(r1024!($($stat)*)) }} +#[macro_export] +macro_rules! r4096 { ($($stat:stmt)*) => { r2!(r2048!($($stat)*)) }} +#[macro_export] +macro_rules! r8192 { ($($stat:stmt)*) => { r2!(r4096!($($stat)*)) }} + +#[macro_export] +macro_rules! b1 { ($($stat:stmt)*) => { nano::run_bench(|| { r1!($($stat)*); }) / 1.0 } } +#[macro_export] +macro_rules! b2 { ($($stat:stmt)*) => { nano::run_bench(|| { r2!($($stat)*); }) / 2.0 } } +#[macro_export] +macro_rules! b4 { ($($stat:stmt)*) => { nano::run_bench(|| { r4!($($stat)*); }) / 4.0 } } +#[macro_export] +macro_rules! b8 { ($($stat:stmt)*) => { nano::run_bench(|| { r8!($($stat)*); }) / 8.0 } } +#[macro_export] +macro_rules! b16 { ($($stat:stmt)*) => { nano::run_bench(|| { r16!($($stat)*); }) / 16.0 } } +#[macro_export] +macro_rules! b32 { ($($stat:stmt)*) => { nano::run_bench(|| { r32!($($stat)*); }) / 32.0 } } +#[macro_export] +macro_rules! b64 { ($($stat:stmt)*) => { nano::run_bench(|| { r64!($($stat)*); }) / 64.0 } } +#[macro_export] +macro_rules! b128 { ($($stat:stmt)*) => { nano::run_bench(|| { r128!($($stat)*); }) / 128.0 } } +#[macro_export] +macro_rules! 
b256 { ($($stat:stmt)*) => { nano::run_bench(|| { r256!($($stat)*); }) / 256.0 } }
+#[macro_export]
+macro_rules! b512 { ($($stat:stmt)*) => { nano::run_bench(|| { r512!($($stat)*); }) / 512.0 } }
+#[macro_export]
+macro_rules! b1024 { ($($stat:stmt)*) => { nano::run_bench(|| { r1024!($($stat)*); }) / 1024.0 } }
+#[macro_export]
+macro_rules! b2048 { ($($stat:stmt)*) => { nano::run_bench(|| { r2048!($($stat)*); }) / 2048.0 } }
+#[macro_export]
+macro_rules! b4096 { ($($stat:stmt)*) => { nano::run_bench(|| { r4096!($($stat)*); }) / 4096.0 } }
+#[macro_export]
+macro_rules! b8192 { ($($stat:stmt)*) => { nano::run_bench(|| { r8192!($($stat)*); }) / 8192.0 } }
+
+/// Returns `dummy` unchanged after re-reading it through a volatile load.
+#[inline]
+fn black_box<T>(dummy: T) -> T {
+    unsafe {
+        // SAFETY: `dummy` is a live local; forgetting it afterwards leaves `ret` as sole owner.
+        let ret = std::ptr::read_volatile(&dummy);
+        std::mem::forget(dummy);
+        ret
+    }
+}
+
+/// Benchmarks `f`, which receives the iteration count to run, and returns the mean time of one
+/// iteration in seconds. Samples more than three interquartile ranges outside the first and
+/// third quartiles are discarded before averaging.
+pub fn run_bench<T, F: FnMut(usize) -> T + Copy>(f: F) -> f64 {
+    let start = Instant::now();
+    let mut f = black_box(f);
+    black_box(f(1));
+    let once = start.elapsed();
+    let evaled = if once < Duration::from_millis(1) {
+        let start = Instant::now();
+        black_box(f)(1000);
+        start.elapsed().as_secs_f64() / 1000.
+    } else {
+        once.as_secs_f64()
+    };
+    // raw evaluation is over a second. stop right there
+    if evaled > 1.0 {
+        return evaled;
+    }
+
+    // we want each individual sample to run for no less than `minimum_sampling_time_s` seconds
+    let minimum_sampling_time_s = 0.01;
+    let minimum_samples = 25;
+    let desired_bench_time = 1.0;
+
+    let inner_loops = (minimum_sampling_time_s / evaled).max(1.0) as usize;
+
+    let samples =
+        ((desired_bench_time / (inner_loops as f64 * evaled)) as usize).max(minimum_samples);
+    let warmup = (1.0 / evaled) as usize;
+
+    let mut measures = vec![0.0; samples];
+
+    black_box(f(warmup));
+    for m in &mut measures {
+        let start = Instant::now();
+        black_box(black_box(f))(inner_loops);
+        let time = start.elapsed().as_secs_f64();
+        *m = time / inner_loops as f64
+    }
+    // `total_cmp` is a real total order; the previous comparator never returned `Equal`
+    measures.sort_unstable_by(|a, b| a.total_cmp(b));
+    let q1 = measures[samples / 4];
+    let q3 = measures[samples - samples / 4];
+    let iq = q3 - q1;
+    measures.retain(|&x| x >= q1 - 3. * iq && x <= q3 + 3. * iq);
+    measures.iter().copied().sum::<f64>() / measures.len() as f64
+}
diff --git a/vendor/tract-linalg-0.22.1/src/lib.rs b/vendor/tract-linalg-0.22.1/src/lib.rs
new file mode 100644
index 000000000..1af6f78b7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/lib.rs
@@ -0,0 +1,404 @@
+#![allow(clippy::missing_safety_doc)]
+#![allow(clippy::redundant_closure_call)]
+#![allow(clippy::len_zero)]
+#![allow(clippy::excessive_precision)]
+#![allow(clippy::approx_constant)]
+#![allow(clippy::manual_is_multiple_of)]
+#![allow(unexpected_cfgs)]
+#![allow(unused_macros)]
+#[macro_use]
+extern crate derive_new;
+extern crate lazy_static;
+extern crate log;
+extern crate num_traits;
+#[macro_use]
+extern crate pastey;
+#[cfg(test)]
+extern crate proptest;
+
+include!(concat!(env!("OUT_DIR"), "/extern_kernel_macro.rs"));
+
+#[macro_use]
+mod frame;
+pub mod generic;
+pub mod multithread;
+pub use frame::weights::WeightType;
+pub use generic::{ScaleShiftAndRound, Scaler};
+use lazy_static::lazy_static;
+use mmm::{MMMInputFormat, MatMatMul, PanelExtractor};
+use tract_data::internal::TensorView;
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64_fma;
+
+pub mod hwbench;
+
+#[cfg(target_arch = "aarch64")]
+pub mod arm64;
+
+#[cfg(target_arch = "aarch64")]
+pub use arm64::has_fp16;
+use tract_itertools::Itertools;
+
+#[cfg(not(target_arch = "aarch64"))]
+pub fn has_fp16() -> bool {
+    false
+}
+
+#[cfg(any(target_arch = "arm", target_arch = "armv7"))]
+pub mod arm32;
+
+#[cfg(all(target_family = "wasm", target_feature = "simd128"))]
+pub mod wasm;
+
+pub use self::frame::*;
+
+use tract_data::prelude::*;
+
+pub type MMMImpl = Box<
+    dyn Fn(Option, Option, Option) -> Box + Send + Sync,
+>;
+
+type MMVImpl = Box, Option) -> Box + Send + Sync>;
+
+#[allow(clippy::type_complexity)]
+pub struct Ops {
+    mmm_impls: Vec>,
+    panel_extractors: Vec,
+
+    mmm_f64: MMMImpl,
+    mmv_f64: MMVImpl,
+
+    mmm_f32: MMMImpl,
+    mmv_f32: MMVImpl,
+
+    mmm_f16: MMMImpl,
+    mmv_f16: MMVImpl,
+
+
qmmm_i32: MMMImpl, + qmmv_i32: MMVImpl, + + pub leaky_relu_f16: Box Box> + Send + Sync>, + pub leaky_relu_f32: Box Box> + Send + Sync>, + pub mul_by_scalar_f32: + Box Box> + Send + Sync>, + pub mul_by_scalar_f16: + Box Box> + Send + Sync>, + + pub sigmoid_f16: Box Box> + Send + Sync>, + pub sigmoid_f32: Box Box> + Send + Sync>, + pub tanh_f16: Box Box> + Send + Sync>, + pub tanh_f32: Box Box> + Send + Sync>, + pub erf_f32: Box Box> + Send + Sync>, + pub lut_u8: Box Box + Send + Sync>, + + pub max_f16: Box Box> + Send + Sync>, + pub max_f32: Box Box> + Send + Sync>, + + pub sum_f16: Box Box> + Send + Sync>, + pub sum_f32: Box Box> + Send + Sync>, + + pub softmax2_fastcompact_f16: + Box Box> + Send + Sync>, + pub softmax2_fastcompact_f32: + Box Box> + Send + Sync>, +} + +impl Ops { + pub fn mmm_impls(&self) -> &[Box] { + &self.mmm_impls + } + + pub fn all_possible_packing( + &self, + weight_type: impl Into, + ) -> impl Iterator { + let weight_type = weight_type.into(); + self.mmm_impls + .iter() + .flat_map(|m| m.packings()) + .map(|p| &*p.0) + .flat_map(move |p| { + let mut packs: Vec<&dyn MMMInputFormat> = vec![]; + if p.precursor() == weight_type { + packs.push(p) + }; + for pe in &self.panel_extractors { + if pe.from.precursor() == weight_type && pe.to.same_as(p) { + packs.push(&*pe.from); + } + } + packs.into_iter() + }) + .sorted_by_key(|p| p.to_string()) + .dedup() + } + + pub fn filter_impls<'o>( + &'o self, + weight: &'o dyn MMMInputFormat, + acc: &[DatumType], + act: DatumType, + store: DatumType, + ) -> impl Iterator< + Item = ( + &'o dyn MatMatMul, + usize, + &'o dyn MMMInputFormat, + Option<&'o PanelExtractor>, + &'o dyn MMMInputFormat, + ), + > { + let acc = acc.to_vec(); + self.mmm_impls + .iter() + .filter(move |mmm| acc.contains(&mmm.internal_type()) && mmm.stores().contains(&store)) + .flat_map(|mmm| { + mmm.packings() + .iter() + .enumerate() + .map(|(pack_ix, (a, b))| (&**mmm, pack_ix, &**a, &**b)) + }) + .filter_map(|(mmm, ix, a, b)| { + if 
a.same_as(weight) { + Some((mmm, ix, a, None, b)) + } else { + self.panel_extractors + .iter() + .find(|pe| pe.from.same_as(weight) && pe.to.same_as(a)) + .map(|pe| (mmm, ix, a, Some(pe), b)) + } + }) + .filter(move |(_mmm, _ix, _a, _pe, b)| { + b.precursor().as_dt().is_some_and(|dt| dt == act) + }) + } + + pub fn panel_extractors(&self) -> &[mmm::panel_extract::PanelExtractor] { + &self.panel_extractors + } + + pub fn mmm( + &self, + accumulator: DatumType, + m: Option, + k: Option, + n: Option, + ) -> Option> { + use DatumType::*; + match accumulator { + F64 => Some(if n == Some(1) { (self.mmv_f64)(m, k) } else { (self.mmm_f64)(m, k, n) }), + F32 => Some(if n == Some(1) { (self.mmv_f32)(m, k) } else { (self.mmm_f32)(m, k, n) }), + F16 => Some(if n == Some(1) { (self.mmv_f16)(m, k) } else { (self.mmm_f16)(m, k, n) }), + I32 => { + Some(if n == Some(1) { (self.qmmv_i32)(m, k) } else { (self.qmmm_i32)(m, k, n) }) + } + _ => None, + } + } +} + +pub fn generic() -> Ops { + use crate::generic::mmm::*; + use element_wise::ElementWiseKer; + use reduce::{MapReduceKer, ReduceKer}; + let mut ops = Ops { + mmm_impls: vec![], + panel_extractors: vec![], + mmm_f64: Box::new(|_, _, _| generic_f64_4x4.mmm()), + mmv_f64: Box::new(|_, _| generic_f64_4x1.mmm()), + mmm_f32: Box::new(|_, _, _| generic_f32_4x4.mmm()), + mmv_f32: Box::new(|_, _| generic_f32_4x1.mmm()), + mmm_f16: Box::new(|_, _, _| generic_f16_4x4.mmm()), + mmv_f16: Box::new(|_, _| generic_f16_4x1.mmm()), + qmmm_i32: Box::new(|_, _, _| generic_i32_4x4.mmm()), + qmmv_i32: Box::new(|_, _| generic_i32_4x4.mmm()), + leaky_relu_f16: Box::new(|| generic::HLeakyRelu8::ew()), + leaky_relu_f32: Box::new(|| generic::SLeakyRelu4::ew()), + mul_by_scalar_f16: Box::new(|| generic::HMulByScalar8::ew()), + mul_by_scalar_f32: Box::new(|| generic::SMulByScalar4::ew()), + sigmoid_f16: Box::new(|| generic::HSigmoid8::ew()), + sigmoid_f32: Box::new(|| generic::SSigmoid4::ew()), + tanh_f16: Box::new(|| generic::HTanh8::ew()), + tanh_f32: 
Box::new(|| generic::STanh4::ew()), + erf_f32: Box::new(|| generic::SErf4::ew()), + lut_u8: Box::new(|table: &[u8]| Box::new(lut::LutImpl::::new(table))), + max_f16: Box::new(|| generic::reduce::max::HMax8::red()), + max_f32: Box::new(|| generic::reduce::max::SMax4::red()), + sum_f16: Box::new(|| generic::reduce::sum::HSum8::red()), + sum_f32: Box::new(|| generic::reduce::sum::SSum4::red()), + /* + activation_f32: Box::new(|microcode| generic::SActivation::new(microcode)) + */ + softmax2_fastcompact_f16: Box::new(|| generic::reduce::softmax_l2::HSoftMaxL2::red()), + softmax2_fastcompact_f32: Box::new(|| generic::reduce::softmax_l2::SSoftMaxL2::red()), + }; + crate::generic::mmm::plug(&mut ops); + ops +} + +#[allow(unreachable_code, unused_mut, unexpected_cfgs)] +pub fn best() -> Ops { + let mut ops = generic(); + #[cfg(target_arch = "x86_64")] + x86_64_fma::plug(&mut ops); + #[cfg(any(target_arch = "arm", target_arch = "armv7"))] + arm32::plug(&mut ops); + #[cfg(target_arch = "aarch64")] + arm64::plug(&mut ops); + #[cfg(all(target_family = "wasm", target_feature = "simd128"))] + wasm::plug(&mut ops); + + ops +} + +lazy_static::lazy_static! 
{ + static ref OPS: Ops = { + best() + }; +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum BinOp { + Min, + Max, + Add, + Mul, + Sub, + SubF, +} + +impl BinOp { + pub fn flip(&self) -> BinOp { + use BinOp::*; + match self { + Sub => SubF, + SubF => Sub, + sym => *sym, + } + } +} + +fn register_all_unicast(registry: &mut LinalgRegistry) { + generic::register_all_unicast(registry); + #[cfg(target_arch = "aarch64")] + arm64::register_all_unicast(registry); +} + +fn register_all_by_scalar(registry: &mut LinalgRegistry) { + generic::register_all_by_scalar(registry); + #[cfg(target_arch = "aarch64")] + arm64::register_all_by_scalar(registry); +} + +pub type LinalgFn = dyn Fn(&mut TensorView, &TensorView) -> TractResult<()> + Send + Sync; +type LinalgRegistry = HashMap<(BinOp, DatumType), Box Box + Send + Sync>>; +lazy_static! { + static ref BIN_UNICAST_OPS: Mutex = { + let mut registry = HashMap::default(); + register_all_unicast(&mut registry); + Mutex::new(registry) + }; + static ref BIN_BY_SCALAR_OPS: Mutex = { + let mut registry = HashMap::default(); + register_all_by_scalar(&mut registry); + Mutex::new(registry) + }; +} + +pub fn bin_by_scalar(dt: DatumType, bin: BinOp) -> Option> { + let map = BIN_BY_SCALAR_OPS.lock().unwrap(); + if (dt == DatumType::F16) && !has_fp16() { + return None; + } + map.get(&(bin, dt)).map(|it| (it)()) +} + +pub fn bin_unicast(dt: DatumType, bin: BinOp) -> Option> { + let map = BIN_UNICAST_OPS.lock().unwrap(); + if (dt == DatumType::F16) && !has_fp16() { + return None; + } + map.get(&(bin, dt)).map(|it| (it)()) +} + +pub fn ops() -> &'static Ops { + &OPS +} + +use num_traits::*; +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::*; +use std::sync::Mutex; + +pub trait LADatum: + Sized + + std::fmt::Display + + Debug + + Copy + + Clone + + Zero + + One + + 'static + + Add + + Sub + + Mul + + AddAssign + + PartialOrd + + Bounded + + tract_data::prelude::Datum +{ + #[cfg(test)] + fn strat() -> 
proptest::prelude::BoxedStrategy; +} + +#[cfg(test)] +use proptest::prelude::*; + +impl LADatum for f16 { + #[cfg(test)] + fn strat() -> BoxedStrategy { + f32::strat().prop_map(|f| f.as_()).boxed() + } +} + +impl LADatum for f32 { + #[cfg(test)] + fn strat() -> BoxedStrategy { + (-1000isize..1000).prop_map(|i| i as f32 / 1000.0).boxed() + } +} + +impl LADatum for f64 { + #[cfg(test)] + fn strat() -> BoxedStrategy { + (-1000isize..1000).prop_map(|i| i as f64 / 1000.0).boxed() + } +} + +impl LADatum for u8 { + #[cfg(test)] + fn strat() -> BoxedStrategy { + any::().boxed() + } +} + +impl LADatum for i8 { + #[cfg(test)] + fn strat() -> BoxedStrategy { + any::().boxed() + } +} + +impl LADatum for i32 { + #[cfg(test)] + fn strat() -> BoxedStrategy { + any::().boxed() + } +} + +#[cfg(test)] +#[allow(dead_code)] +fn setup_test_logger() { + let _ = env_logger::Builder::from_env("TRACT_LOG").try_init(); +} diff --git a/vendor/tract-linalg-0.22.1/src/multithread.rs b/vendor/tract-linalg-0.22.1/src/multithread.rs new file mode 100644 index 000000000..51f2f07c0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/multithread.rs @@ -0,0 +1,57 @@ +use std::cell::RefCell; +#[allow(unused_imports)] +use std::sync::{Arc, Mutex}; + +#[cfg(feature = "multithread-mm")] +use rayon::{ThreadPool, ThreadPoolBuilder}; + +#[derive(Debug, Clone, Default)] +pub enum Executor { + #[default] + SingleThread, + #[cfg(feature = "multithread-mm")] + MultiThread(Arc), +} + +impl Executor { + #[cfg(feature = "multithread-mm")] + pub fn multithread(n: usize) -> Executor { + Executor::multithread_with_name(n, "tract-default") + } + + #[cfg(feature = "multithread-mm")] + pub fn multithread_with_name(n: usize, name: &str) -> Executor { + let name = name.to_string(); + let pool = ThreadPoolBuilder::new() + .thread_name(move |n| format!("{name}-{n}")) + .num_threads(n) + .build() + .unwrap(); + Executor::MultiThread(Arc::new(pool)) + } +} + +static DEFAULT_EXECUTOR: Mutex = Mutex::new(Executor::SingleThread); 
+ +thread_local! { + static TLS_EXECUTOR_OVERRIDE: RefCell> = Default::default(); +} + +pub fn current_tract_executor() -> Executor { + if let Some(over_ride) = TLS_EXECUTOR_OVERRIDE.with_borrow(|tls| tls.clone()) { + over_ride + } else { + DEFAULT_EXECUTOR.lock().unwrap().clone() + } +} + +pub fn set_default_executor(executor: Executor) { + *DEFAULT_EXECUTOR.lock().unwrap() = executor; +} + +pub fn multithread_tract_scope R>(pool: Executor, f: F) -> R { + let previous = TLS_EXECUTOR_OVERRIDE.replace(Some(pool)); + let result = f(); + TLS_EXECUTOR_OVERRIDE.set(previous); + result +} diff --git a/vendor/tract-linalg-0.22.1/src/wasm.rs b/vendor/tract-linalg-0.22.1/src/wasm.rs new file mode 100644 index 000000000..628fc720c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/wasm.rs @@ -0,0 +1,1664 @@ +/// Wasm SIMD implementation of `MatMatMulKer` +/// +/// To run test, you need to install `wasmtime` +/// and export the following environment variables: +/// ``` +/// > export RUSTFLAGS='-C target-feature=+simd128' +/// > export CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime +/// > cargo test --target=wasm32-wasi +/// ``` +use crate::mmm::FusedKerSpec; +use crate::mmm::ImplementationQuality; +use crate::{Ops, Scaler}; + +pub fn plug(ops: &mut Ops) { + ops.mmm_impls.push(wasm_f32_4x4.mmm()); + ops.mmm_impls.push(wasm_f32_4x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x1.mmm()); + ops.mmm_impls.push(wasm_f32_16x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x8.mmm()); + // Selection: max(nr*mr) for N>1, max(mr) for N=1. 
+ // - N>1 ops: 8x8 (nr*mr=64) wins over 4x4 (16) + // - N=1 ops: 16x1 (mr=16) wins + ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm()); + ops.mmv_f32 = Box::new(|m, _k| match m.unwrap_or(0) { + 0..=7 => wasm_f32_4x1.mmm(), + 8..=15 => wasm_f32_8x1.mmm(), + _ => wasm_f32_16x1.mmm(), + }); +} + +unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Each of these variables stores a row of the matrix, + // consisting of four packed `f32` numbers. + let mut ab0 = f32x4_splat(0.0); + let mut ab1 = f32x4_splat(0.0); + let mut ab2 = f32x4_splat(0.0); + let mut ab3 = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let a = f32x4_splat(0.0); + ab0 = a; + ab1 = a; + ab2 = a; + ab3 = a; + } + FusedKerSpec::LoadTile(_cols, rows) => { + let rows = rows as *const v128; + ab0 = *rows; + ab1 = *rows.add(1); + ab2 = *rows.add(2); + ab3 = *rows.add(3); + } + FusedKerSpec::ScalarMin(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_min(a, ab0); + ab1 = f32x4_min(a, ab1); + ab2 = f32x4_min(a, ab2); + ab3 = f32x4_min(a, ab3); + } + FusedKerSpec::ScalarMax(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_max(a, ab0); + ab1 = f32x4_max(a, ab1); + ab2 = f32x4_max(a, ab2); + ab3 = f32x4_max(a, ab3); + } + FusedKerSpec::ScalarAdd(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_add(a, ab0); + ab1 = f32x4_add(a, ab1); + ab2 = f32x4_add(a, ab2); + ab3 = f32x4_add(a, ab3); + } + FusedKerSpec::ScalarMul(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_mul(a, ab0); + ab1 = f32x4_mul(a, ab1); + ab2 = f32x4_mul(a, ab2); + ab3 = f32x4_mul(a, ab3); + } + FusedKerSpec::ScalarSub(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_sub(a, ab0); + ab1 = f32x4_sub(a, ab1); + ab2 = f32x4_sub(a, ab2); + ab3 = f32x4_sub(a, ab3); + } + FusedKerSpec::ScalarSubF(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_sub(ab0, a); + ab1 = f32x4_sub(ab1, a); + ab2 = f32x4_sub(ab2, a); + ab3 = 
f32x4_sub(ab3, a); + } + FusedKerSpec::LeakyRelu(a) => { + let a = f32x4_splat(a); + let zero = f32x4_splat(0.0); + + let mask0 = f32x4_gt(ab0, zero); + ab0 = v128_bitselect(ab0, f32x4_mul(a, ab0), mask0); + + let mask1 = f32x4_gt(ab1, zero); + ab1 = v128_bitselect(ab1, f32x4_mul(a, ab1), mask1); + + let mask2 = f32x4_gt(ab2, zero); + ab2 = v128_bitselect(ab2, f32x4_mul(a, ab2), mask2); + + let mask3 = f32x4_gt(ab3, zero); + ab3 = v128_bitselect(ab3, f32x4_mul(a, ab3), mask3); + } + FusedKerSpec::PerRowMin(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_min(f32x4_splat(row[0]), ab0); + ab1 = f32x4_min(f32x4_splat(row[1]), ab1); + ab2 = f32x4_min(f32x4_splat(row[2]), ab2); + ab3 = f32x4_min(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowMax(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_max(f32x4_splat(row[0]), ab0); + ab1 = f32x4_max(f32x4_splat(row[1]), ab1); + ab2 = f32x4_max(f32x4_splat(row[2]), ab2); + ab3 = f32x4_max(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowAdd(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_add(f32x4_splat(row[0]), ab0); + ab1 = f32x4_add(f32x4_splat(row[1]), ab1); + ab2 = f32x4_add(f32x4_splat(row[2]), ab2); + ab3 = f32x4_add(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowMul(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_mul(f32x4_splat(row[0]), ab0); + ab1 = f32x4_mul(f32x4_splat(row[1]), ab1); + ab2 = f32x4_mul(f32x4_splat(row[2]), ab2); + ab3 = f32x4_mul(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowSub(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_sub(f32x4_splat(row[0]), ab0); + ab1 = f32x4_sub(f32x4_splat(row[1]), ab1); + ab2 = f32x4_sub(f32x4_splat(row[2]), ab2); + ab3 = f32x4_sub(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowSubF(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_sub(ab0, f32x4_splat(row[0])); + ab1 = f32x4_sub(ab1, 
f32x4_splat(row[1])); + ab2 = f32x4_sub(ab2, f32x4_splat(row[2])); + ab3 = f32x4_sub(ab3, f32x4_splat(row[3])); + } + FusedKerSpec::PerColMin(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_min(cols, ab0); + ab1 = f32x4_min(cols, ab1); + ab2 = f32x4_min(cols, ab2); + ab3 = f32x4_min(cols, ab3); + } + FusedKerSpec::PerColMax(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_max(cols, ab0); + ab1 = f32x4_max(cols, ab1); + ab2 = f32x4_max(cols, ab2); + ab3 = f32x4_max(cols, ab3); + } + FusedKerSpec::PerColAdd(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_add(cols, ab0); + ab1 = f32x4_add(cols, ab1); + ab2 = f32x4_add(cols, ab2); + ab3 = f32x4_add(cols, ab3); + } + FusedKerSpec::PerColMul(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_mul(cols, ab0); + ab1 = f32x4_mul(cols, ab1); + ab2 = f32x4_mul(cols, ab2); + ab3 = f32x4_mul(cols, ab3); + } + FusedKerSpec::PerColSub(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_sub(cols, ab0); + ab1 = f32x4_sub(cols, ab1); + ab2 = f32x4_sub(cols, ab2); + ab3 = f32x4_sub(cols, ab3); + } + FusedKerSpec::PerColSubF(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_sub(ab0, cols); + ab1 = f32x4_sub(ab1, cols); + ab2 = f32x4_sub(ab2, cols); + ab3 = f32x4_sub(ab3, cols); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let scale = f32x4_splat(scaler.scale); + ab0 = f32x4_mul(scale, ab0); + ab1 = f32x4_mul(scale, ab1); + ab2 = f32x4_mul(scale, ab2); + ab3 = f32x4_mul(scale, ab3); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let shift = f32x4_splat(2f32.powi(-(shift as i32))); + ab0 = f32x4_mul(shift, ab0); + ab1 = f32x4_mul(shift, ab1); + ab2 = f32x4_mul(shift, ab2); + ab3 = f32x4_mul(shift, ab3); + } + FusedKerSpec::ShiftLeft(shift) => { + let shift = f32x4_splat(2f32.powi(shift as i32)); + ab0 = f32x4_mul(shift, ab0); + ab1 = 
f32x4_mul(shift, ab1); + ab2 = f32x4_mul(shift, ab2); + ab3 = f32x4_mul(shift, ab3); + } + FusedKerSpec::AddUnicast(tile) => { + let mut ptr: *const u8 = tile.ptr; + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab0 = f32x4_add(ab0, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab1 = f32x4_add(ab1, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab2 = f32x4_add(ab2, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab3 = f32x4_add(ab3, f32x4(m0, m1, m2, m3)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols)); + ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols)); + ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols)); + ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols)); + } + FusedKerSpec::Store(tile) => { + let mut ptr: *mut u8 = tile.ptr; + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab0); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab0); + 
*(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab0); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab0); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab1); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab1); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab1); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab1); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab2); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab2); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab2); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab2); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab3); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab3); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab3); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab3); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + let a = pa as *const f32; + let b = pb as *const v128; + for i in 0..k { + let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4); + let b = v128_load(b.offset(i as isize)); + ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b)); + ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b)); + ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b)); + ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products +/// (single-column outputs, 
e.g., streaming-RNN inference where each frame's +/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec +/// match arms but collapses the column dimension from 4 to 1: a single +/// f32x4 accumulator holds 4 output rows × 1 output column packed as +/// [ab[0], ab[1], ab[2], ab[3]]. +/// +/// Selection: tract-core's einsum kernel_selection::strategize() prefers +/// kernels with nr() == 1 when op.n.is_one(), so this kernel is +/// automatically picked for N=1 cases once registered. +unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Single accumulator: 4 rows × 1 col, packed into one f32x4. + // lane[i] holds ab[i] = the output value for row i (col 0). + let mut ab = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + ab = f32x4_splat(0.0); + } + FusedKerSpec::LoadTile(_cols, rows) => { + // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128 + ab = v128_load(rows as *const v128); + } + FusedKerSpec::ScalarMin(a) => { + ab = f32x4_min(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarMax(a) => { + ab = f32x4_max(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarAdd(a) => { + ab = f32x4_add(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarMul(a) => { + ab = f32x4_mul(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarSub(a) => { + ab = f32x4_sub(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarSubF(a) => { + ab = f32x4_sub(ab, f32x4_splat(a)); + } + FusedKerSpec::LeakyRelu(a) => { + let zero = f32x4_splat(0.0); + let mask = f32x4_gt(ab, zero); + ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask); + } + FusedKerSpec::PerRowMin(row) => { + // 4 row values, applied to ab's 4 lanes in order + let r = v128_load(row as *const v128); + ab = f32x4_min(r, ab); + } + FusedKerSpec::PerRowMax(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_max(r, ab); + } + FusedKerSpec::PerRowAdd(row) => { + let r = v128_load(row as 
*const v128); + ab = f32x4_add(r, ab); + } + FusedKerSpec::PerRowMul(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_mul(r, ab); + } + FusedKerSpec::PerRowSub(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_sub(r, ab); + } + FusedKerSpec::PerRowSubF(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_sub(ab, r); + } + FusedKerSpec::PerColMin(cols) => { + // Single col value broadcast to all 4 rows + ab = f32x4_min(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColMax(cols) => { + ab = f32x4_max(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColAdd(cols) => { + ab = f32x4_add(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColMul(cols) => { + ab = f32x4_mul(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColSub(cols) => { + ab = f32x4_sub(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColSubF(cols) => { + ab = f32x4_sub(ab, f32x4_splat(*cols)); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + ab = f32x4_mul(f32x4_splat(scaler.scale), ab); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab = f32x4_mul(s, ab); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab = f32x4_mul(s, ab); + } + FusedKerSpec::AddUnicast(tile) => { + // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1) + let mut ptr: *const u8 = tile.ptr; + let m0 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m1 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m2 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m3 = *(ptr as *const f32); + ab = f32x4_add(ab, f32x4(m0, m1, m2, m3)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + // ab[i] += rows[i] * cols[0] (cols[0] is the single col) + let r = v128_load(rows as *const v128); + let c = f32x4_splat(*cols); + ab = 
f32x4_add(ab, f32x4_mul(r, c)); + } + FusedKerSpec::Store(tile) => { + // 4 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128. + // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast. + // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a_vec = v128_load(a.offset(i as isize)); + let b_splat = f32x4_splat(*b.offset(i as isize)); + ab = f32x4_add(ab, f32x4_mul(a_vec, b_splat)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1(4,1)@(4,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products +/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top, +/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration: +/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per +/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the +/// hardware pipeline level. +/// +/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is +/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For +/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield +/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput +/// exceeds 1 op/cycle. 
+/// +/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1 +/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1 +/// ops once registered (including small-M cases where it slightly wastes +/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those +/// ops are <1% of frame so the regression is noise). +unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot] + // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4 + let mut ab_top = f32x4_splat(0.0); + let mut ab_bot = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + ab_top = f32x4_splat(0.0); + ab_bot = f32x4_splat(0.0); + } + FusedKerSpec::LoadTile(_cols, rows) => { + // 8 rows × 1 col = 8 contiguous f32 = 2 v128 + let p = rows as *const v128; + ab_top = *p; + ab_bot = *p.add(1); + } + FusedKerSpec::ScalarMin(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_min(s, ab_top); + ab_bot = f32x4_min(s, ab_bot); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_max(s, ab_top); + ab_bot = f32x4_max(s, ab_bot); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_add(s, ab_top); + ab_bot = f32x4_add(s, ab_bot); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_sub(s, ab_top); + ab_bot = f32x4_sub(s, ab_bot); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_sub(ab_top, s); + ab_bot = f32x4_sub(ab_bot, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let mask_t = f32x4_gt(ab_top, zero); + let mask_b = f32x4_gt(ab_bot, zero); + ab_top = v128_bitselect(ab_top, 
f32x4_mul(s, ab_top), mask_t); + ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b); + } + FusedKerSpec::PerRowMin(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_min(r_t, ab_top); + ab_bot = f32x4_min(r_b, ab_bot); + } + FusedKerSpec::PerRowMax(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_max(r_t, ab_top); + ab_bot = f32x4_max(r_b, ab_bot); + } + FusedKerSpec::PerRowAdd(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_add(r_t, ab_top); + ab_bot = f32x4_add(r_b, ab_bot); + } + FusedKerSpec::PerRowMul(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_mul(r_t, ab_top); + ab_bot = f32x4_mul(r_b, ab_bot); + } + FusedKerSpec::PerRowSub(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_sub(r_t, ab_top); + ab_bot = f32x4_sub(r_b, ab_bot); + } + FusedKerSpec::PerRowSubF(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_sub(ab_top, r_t); + ab_bot = f32x4_sub(ab_bot, r_b); + } + FusedKerSpec::PerColMin(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_min(c, ab_top); + ab_bot = f32x4_min(c, ab_bot); + } + FusedKerSpec::PerColMax(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_max(c, ab_top); + ab_bot = f32x4_max(c, ab_bot); + } + FusedKerSpec::PerColAdd(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_add(c, ab_top); + ab_bot = f32x4_add(c, ab_bot); + } + FusedKerSpec::PerColMul(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_mul(c, ab_top); + ab_bot = f32x4_mul(c, ab_bot); + } + FusedKerSpec::PerColSub(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_sub(c, ab_top); + ab_bot = f32x4_sub(c, ab_bot); + } + 
FusedKerSpec::PerColSubF(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_sub(ab_top, c); + ab_bot = f32x4_sub(ab_bot, c); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::AddUnicast(tile) => { + // 8 rows × 1 col, stride is row_byte_stride between rows + let mut ptr: *const u8 = tile.ptr; + let m0 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m1 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m2 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m3 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m4 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m5 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m6 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m7 = *(ptr as *const f32); + ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3)); + ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = rows as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + let c = f32x4_splat(*cols); + ab_top = f32x4_add(ab_top, f32x4_mul(r_t, c)); + ab_bot = f32x4_add(ab_bot, f32x4_mul(r_b, c)); + } + FusedKerSpec::Store(tile) => { + // 8 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top); + ptr = 
ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_bot); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128 + // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast + // The two fmadd ops on (ab_top, ab_bot) are independent — 2-way ILP per iter. + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a_t = v128_load(a.offset((2 * i) as isize)); + let a_b = v128_load(a.offset((2 * i + 1) as isize)); + let b_splat = f32x4_splat(*b.offset(i as isize)); + ab_top = f32x4_add(ab_top, f32x4_mul(a_t, b_splat)); + ab_bot = f32x4_add(ab_bot, f32x4_mul(a_b, b_splat)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1(8,1)@(8,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products +/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3, +/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration. +/// +/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more +/// parallel work to the SIMD pipelines, beneficial on hardware with 3+ +/// SIMD execution units (most modern ARM and x86). 
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3] + // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15 + let mut ab_q0 = f32x4_splat(0.0); + let mut ab_q1 = f32x4_splat(0.0); + let mut ab_q2 = f32x4_splat(0.0); + let mut ab_q3 = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let z = f32x4_splat(0.0); + ab_q0 = z; + ab_q1 = z; + ab_q2 = z; + ab_q3 = z; + } + FusedKerSpec::LoadTile(_cols, rows) => { + let p = rows as *const v128; + ab_q0 = *p; + ab_q1 = *p.add(1); + ab_q2 = *p.add(2); + ab_q3 = *p.add(3); + } + FusedKerSpec::ScalarMin(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_min(s, ab_q0); + ab_q1 = f32x4_min(s, ab_q1); + ab_q2 = f32x4_min(s, ab_q2); + ab_q3 = f32x4_min(s, ab_q3); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_max(s, ab_q0); + ab_q1 = f32x4_max(s, ab_q1); + ab_q2 = f32x4_max(s, ab_q2); + ab_q3 = f32x4_max(s, ab_q3); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_add(s, ab_q0); + ab_q1 = f32x4_add(s, ab_q1); + ab_q2 = f32x4_add(s, ab_q2); + ab_q3 = f32x4_add(s, ab_q3); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_sub(s, ab_q0); + ab_q1 = f32x4_sub(s, ab_q1); + ab_q2 = f32x4_sub(s, ab_q2); + ab_q3 = f32x4_sub(s, ab_q3); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_sub(ab_q0, s); + ab_q1 = f32x4_sub(ab_q1, s); + ab_q2 = f32x4_sub(ab_q2, s); + ab_q3 = f32x4_sub(ab_q3, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let m0 = f32x4_gt(ab_q0, 
zero); + ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0); + let m1 = f32x4_gt(ab_q1, zero); + ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1); + let m2 = f32x4_gt(ab_q2, zero); + ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2); + let m3 = f32x4_gt(ab_q3, zero); + ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3); + } + FusedKerSpec::PerRowMin(row) => { + let p = row as *const v128; + ab_q0 = f32x4_min(v128_load(p), ab_q0); + ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowMax(row) => { + let p = row as *const v128; + ab_q0 = f32x4_max(v128_load(p), ab_q0); + ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowAdd(row) => { + let p = row as *const v128; + ab_q0 = f32x4_add(v128_load(p), ab_q0); + ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowMul(row) => { + let p = row as *const v128; + ab_q0 = f32x4_mul(v128_load(p), ab_q0); + ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowSub(row) => { + let p = row as *const v128; + ab_q0 = f32x4_sub(v128_load(p), ab_q0); + ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowSubF(row) => { + let p = row as *const v128; + ab_q0 = f32x4_sub(ab_q0, v128_load(p)); + ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1))); + ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2))); + ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3))); + } + FusedKerSpec::PerColMin(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_min(c, ab_q0); + 
ab_q1 = f32x4_min(c, ab_q1); + ab_q2 = f32x4_min(c, ab_q2); + ab_q3 = f32x4_min(c, ab_q3); + } + FusedKerSpec::PerColMax(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_max(c, ab_q0); + ab_q1 = f32x4_max(c, ab_q1); + ab_q2 = f32x4_max(c, ab_q2); + ab_q3 = f32x4_max(c, ab_q3); + } + FusedKerSpec::PerColAdd(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_add(c, ab_q0); + ab_q1 = f32x4_add(c, ab_q1); + ab_q2 = f32x4_add(c, ab_q2); + ab_q3 = f32x4_add(c, ab_q3); + } + FusedKerSpec::PerColMul(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_mul(c, ab_q0); + ab_q1 = f32x4_mul(c, ab_q1); + ab_q2 = f32x4_mul(c, ab_q2); + ab_q3 = f32x4_mul(c, ab_q3); + } + FusedKerSpec::PerColSub(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_sub(c, ab_q0); + ab_q1 = f32x4_sub(c, ab_q1); + ab_q2 = f32x4_sub(c, ab_q2); + ab_q3 = f32x4_sub(c, ab_q3); + } + FusedKerSpec::PerColSubF(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_sub(ab_q0, c); + ab_q1 = f32x4_sub(ab_q1, c); + ab_q2 = f32x4_sub(ab_q2, c); + ab_q3 = f32x4_sub(ab_q3, c); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::AddUnicast(tile) => { + // 16 rows × 1 col, with row_byte_stride between rows + let mut ptr: *const u8 = tile.ptr; + let mut ms = [0f32; 16]; + for i in 0..16 { + ms[i] = *(ptr as *const f32); + ptr 
= ptr.add(tile.row_byte_stride as usize); + } + ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3])); + ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7])); + ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11])); + ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15])); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = rows as *const v128; + let c = f32x4_splat(*cols); + ab_q0 = f32x4_add(ab_q0, f32x4_mul(v128_load(p), c)); + ab_q1 = f32x4_add(ab_q1, f32x4_mul(v128_load(p.add(1)), c)); + ab_q2 = f32x4_add(ab_q2, f32x4_mul(v128_load(p.add(2)), c)); + ab_q3 = f32x4_add(ab_q3, f32x4_mul(v128_load(p.add(3)), c)); + } + FusedKerSpec::Store(tile) => { + // 16 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() { + *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128 + // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast + // 4 INDEPENDENT fmadds per k-iter — 4-way ILP + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a0 = v128_load(a.offset((4 * i) as isize)); + let a1 = v128_load(a.offset((4 * i + 1) as isize)); + let a2 = v128_load(a.offset((4 * i + 2) as isize)); + let a3 = v128_load(a.offset((4 * i + 3) as isize)); + let bs = f32x4_splat(*b.offset(i as isize)); + ab_q0 = f32x4_add(ab_q0, f32x4_mul(a0, bs)); + ab_q1 = f32x4_add(ab_q1, f32x4_mul(a1, bs)); + ab_q2 = f32x4_add(ab_q2, f32x4_mul(a2, bs)); + ab_q3 = f32x4_add(ab_q3, 
f32x4_mul(a3, bs)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1(16,1)@(16,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators). +/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. WASM has no fixed SIMD +/// register count (the engine maps v128 values to host registers), but 16 accumulators +/// equal the 16 XMM registers of an SSE/AVX x86-64 host, so this tests the register-pressure +/// boundary. For DFN3 ops, all M and N are multiples of 8, so 8x8 fits with no padding waste. +unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi) + let mut a0lo = f32x4_splat(0.0); + let mut a0hi = f32x4_splat(0.0); + let mut a1lo = f32x4_splat(0.0); + let mut a1hi = f32x4_splat(0.0); + let mut a2lo = f32x4_splat(0.0); + let mut a2hi = f32x4_splat(0.0); + let mut a3lo = f32x4_splat(0.0); + let mut a3hi = f32x4_splat(0.0); + let mut a4lo = f32x4_splat(0.0); + let mut a4hi = f32x4_splat(0.0); + let mut a5lo = f32x4_splat(0.0); + let mut a5hi = f32x4_splat(0.0); + let mut a6lo = f32x4_splat(0.0); + let mut a6hi = f32x4_splat(0.0); + let mut a7lo = f32x4_splat(0.0); + let mut a7hi = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let z = f32x4_splat(0.0); + a0lo = z; + a0hi = z; + a1lo = z; + a1hi = z; + a2lo = z; + a2hi = z; + a3lo = z; + a3hi = z; + a4lo = z; + a4hi = z; + a5lo = z; + a5hi = z; + a6lo = z; + a6hi = z; + a7lo = z; + a7hi = z; + } + FusedKerSpec::LoadTile(_cols, rows) => { + // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi) + let p = rows as *const v128; + a0lo = *p.add(0); + a0hi = *p.add(1); + a1lo = *p.add(2); + a1hi = *p.add(3); + a2lo = *p.add(4); + a2hi = *p.add(5); + a3lo = *p.add(6); + a3hi = *p.add(7); + a4lo = *p.add(8); + a4hi = *p.add(9); + a5lo = *p.add(10); + a5hi = 
*p.add(11); + a6lo = *p.add(12); + a6hi = *p.add(13); + a7lo = *p.add(14); + a7hi = *p.add(15); + } + FusedKerSpec::ScalarMin(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_min(s, a0lo); + a0hi = f32x4_min(s, a0hi); + a1lo = f32x4_min(s, a1lo); + a1hi = f32x4_min(s, a1hi); + a2lo = f32x4_min(s, a2lo); + a2hi = f32x4_min(s, a2hi); + a3lo = f32x4_min(s, a3lo); + a3hi = f32x4_min(s, a3hi); + a4lo = f32x4_min(s, a4lo); + a4hi = f32x4_min(s, a4hi); + a5lo = f32x4_min(s, a5lo); + a5hi = f32x4_min(s, a5hi); + a6lo = f32x4_min(s, a6lo); + a6hi = f32x4_min(s, a6hi); + a7lo = f32x4_min(s, a7lo); + a7hi = f32x4_min(s, a7hi); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_max(s, a0lo); + a0hi = f32x4_max(s, a0hi); + a1lo = f32x4_max(s, a1lo); + a1hi = f32x4_max(s, a1hi); + a2lo = f32x4_max(s, a2lo); + a2hi = f32x4_max(s, a2hi); + a3lo = f32x4_max(s, a3lo); + a3hi = f32x4_max(s, a3hi); + a4lo = f32x4_max(s, a4lo); + a4hi = f32x4_max(s, a4hi); + a5lo = f32x4_max(s, a5lo); + a5hi = f32x4_max(s, a5hi); + a6lo = f32x4_max(s, a6lo); + a6hi = f32x4_max(s, a6hi); + a7lo = f32x4_max(s, a7lo); + a7hi = f32x4_max(s, a7hi); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_add(s, a0lo); + a0hi = f32x4_add(s, a0hi); + a1lo = f32x4_add(s, a1lo); + a1hi = f32x4_add(s, a1hi); + a2lo = f32x4_add(s, a2lo); + a2hi = f32x4_add(s, a2hi); + a3lo = f32x4_add(s, a3lo); + a3hi = f32x4_add(s, a3hi); + a4lo = f32x4_add(s, a4lo); + a4hi = f32x4_add(s, a4hi); + a5lo = f32x4_add(s, a5lo); + a5hi = f32x4_add(s, a5hi); + a6lo = f32x4_add(s, a6lo); + a6hi = f32x4_add(s, a6hi); + a7lo = f32x4_add(s, a7lo); + a7hi = f32x4_add(s, a7hi); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, 
a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_sub(s, a0lo); + a0hi = f32x4_sub(s, a0hi); + a1lo = f32x4_sub(s, a1lo); + a1hi = f32x4_sub(s, a1hi); + a2lo = f32x4_sub(s, a2lo); + a2hi = f32x4_sub(s, a2hi); + a3lo = f32x4_sub(s, a3lo); + a3hi = f32x4_sub(s, a3hi); + a4lo = f32x4_sub(s, a4lo); + a4hi = f32x4_sub(s, a4hi); + a5lo = f32x4_sub(s, a5lo); + a5hi = f32x4_sub(s, a5hi); + a6lo = f32x4_sub(s, a6lo); + a6hi = f32x4_sub(s, a6hi); + a7lo = f32x4_sub(s, a7lo); + a7hi = f32x4_sub(s, a7hi); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_sub(a0lo, s); + a0hi = f32x4_sub(a0hi, s); + a1lo = f32x4_sub(a1lo, s); + a1hi = f32x4_sub(a1hi, s); + a2lo = f32x4_sub(a2lo, s); + a2hi = f32x4_sub(a2hi, s); + a3lo = f32x4_sub(a3lo, s); + a3hi = f32x4_sub(a3hi, s); + a4lo = f32x4_sub(a4lo, s); + a4hi = f32x4_sub(a4hi, s); + a5lo = f32x4_sub(a5lo, s); + a5hi = f32x4_sub(a5hi, s); + a6lo = f32x4_sub(a6lo, s); + a6hi = f32x4_sub(a6hi, s); + a7lo = f32x4_sub(a7lo, s); + a7hi = f32x4_sub(a7hi, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let m0a = f32x4_gt(a0lo, zero); + a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a); + let m0b = f32x4_gt(a0hi, zero); + a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b); + let m1a = f32x4_gt(a1lo, zero); + a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a); + let m1b = f32x4_gt(a1hi, zero); + a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b); + let m2a = f32x4_gt(a2lo, zero); + a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a); + let m2b = f32x4_gt(a2hi, zero); + a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b); + let m3a = f32x4_gt(a3lo, zero); + a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a); + let 
m3b = f32x4_gt(a3hi, zero); + a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b); + let m4a = f32x4_gt(a4lo, zero); + a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a); + let m4b = f32x4_gt(a4hi, zero); + a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b); + let m5a = f32x4_gt(a5lo, zero); + a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a); + let m5b = f32x4_gt(a5hi, zero); + a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b); + let m6a = f32x4_gt(a6lo, zero); + a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a); + let m6b = f32x4_gt(a6hi, zero); + a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b); + let m7a = f32x4_gt(a7lo, zero); + a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a); + let m7b = f32x4_gt(a7hi, zero); + a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b); + } + FusedKerSpec::PerRowMin(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_min(r0, a0lo); + a0hi = f32x4_min(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_min(r1, a1lo); + a1hi = f32x4_min(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_min(r2, a2lo); + a2hi = f32x4_min(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_min(r3, a3lo); + a3hi = f32x4_min(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_min(r4, a4lo); + a4hi = f32x4_min(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_min(r5, a5lo); + a5hi = f32x4_min(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_min(r6, a6lo); + a6hi = f32x4_min(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_min(r7, a7lo); + a7hi = f32x4_min(r7, a7hi); + } + FusedKerSpec::PerRowMax(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_max(r0, a0lo); + a0hi = f32x4_max(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_max(r1, a1lo); + a1hi = f32x4_max(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_max(r2, a2lo); + a2hi = f32x4_max(r2, a2hi); + let r3 = f32x4_splat(r[3]); + 
a3lo = f32x4_max(r3, a3lo); + a3hi = f32x4_max(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_max(r4, a4lo); + a4hi = f32x4_max(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_max(r5, a5lo); + a5hi = f32x4_max(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_max(r6, a6lo); + a6hi = f32x4_max(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_max(r7, a7lo); + a7hi = f32x4_max(r7, a7hi); + } + FusedKerSpec::PerRowAdd(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_add(r0, a0lo); + a0hi = f32x4_add(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_add(r1, a1lo); + a1hi = f32x4_add(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_add(r2, a2lo); + a2hi = f32x4_add(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_add(r3, a3lo); + a3hi = f32x4_add(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_add(r4, a4lo); + a4hi = f32x4_add(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_add(r5, a5lo); + a5hi = f32x4_add(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_add(r6, a6lo); + a6hi = f32x4_add(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_add(r7, a7lo); + a7hi = f32x4_add(r7, a7hi); + } + FusedKerSpec::PerRowMul(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_mul(r0, a0lo); + a0hi = f32x4_mul(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_mul(r1, a1lo); + a1hi = f32x4_mul(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_mul(r2, a2lo); + a2hi = f32x4_mul(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_mul(r3, a3lo); + a3hi = f32x4_mul(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_mul(r4, a4lo); + a4hi = f32x4_mul(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_mul(r5, a5lo); + a5hi = f32x4_mul(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_mul(r6, a6lo); + a6hi = f32x4_mul(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_mul(r7, a7lo); 
+ a7hi = f32x4_mul(r7, a7hi); + } + FusedKerSpec::PerRowSub(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_sub(r0, a0lo); + a0hi = f32x4_sub(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_sub(r1, a1lo); + a1hi = f32x4_sub(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_sub(r2, a2lo); + a2hi = f32x4_sub(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_sub(r3, a3lo); + a3hi = f32x4_sub(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_sub(r4, a4lo); + a4hi = f32x4_sub(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_sub(r5, a5lo); + a5hi = f32x4_sub(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_sub(r6, a6lo); + a6hi = f32x4_sub(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_sub(r7, a7lo); + a7hi = f32x4_sub(r7, a7hi); + } + FusedKerSpec::PerRowSubF(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_sub(a0lo, r0); + a0hi = f32x4_sub(a0hi, r0); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_sub(a1lo, r1); + a1hi = f32x4_sub(a1hi, r1); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_sub(a2lo, r2); + a2hi = f32x4_sub(a2hi, r2); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_sub(a3lo, r3); + a3hi = f32x4_sub(a3hi, r3); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_sub(a4lo, r4); + a4hi = f32x4_sub(a4hi, r4); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_sub(a5lo, r5); + a5hi = f32x4_sub(a5hi, r5); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_sub(a6lo, r6); + a6hi = f32x4_sub(a6hi, r6); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_sub(a7lo, r7); + a7hi = f32x4_sub(a7hi, r7); + } + FusedKerSpec::PerColMin(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_min(clo, a0lo); + a0hi = f32x4_min(chi, a0hi); + a1lo = f32x4_min(clo, a1lo); + a1hi = f32x4_min(chi, a1hi); + a2lo = f32x4_min(clo, a2lo); + a2hi = f32x4_min(chi, a2hi); + a3lo = f32x4_min(clo, a3lo); + a3hi = 
f32x4_min(chi, a3hi); + a4lo = f32x4_min(clo, a4lo); + a4hi = f32x4_min(chi, a4hi); + a5lo = f32x4_min(clo, a5lo); + a5hi = f32x4_min(chi, a5hi); + a6lo = f32x4_min(clo, a6lo); + a6hi = f32x4_min(chi, a6hi); + a7lo = f32x4_min(clo, a7lo); + a7hi = f32x4_min(chi, a7hi); + } + FusedKerSpec::PerColMax(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_max(clo, a0lo); + a0hi = f32x4_max(chi, a0hi); + a1lo = f32x4_max(clo, a1lo); + a1hi = f32x4_max(chi, a1hi); + a2lo = f32x4_max(clo, a2lo); + a2hi = f32x4_max(chi, a2hi); + a3lo = f32x4_max(clo, a3lo); + a3hi = f32x4_max(chi, a3hi); + a4lo = f32x4_max(clo, a4lo); + a4hi = f32x4_max(chi, a4hi); + a5lo = f32x4_max(clo, a5lo); + a5hi = f32x4_max(chi, a5hi); + a6lo = f32x4_max(clo, a6lo); + a6hi = f32x4_max(chi, a6hi); + a7lo = f32x4_max(clo, a7lo); + a7hi = f32x4_max(chi, a7hi); + } + FusedKerSpec::PerColAdd(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_add(clo, a0lo); + a0hi = f32x4_add(chi, a0hi); + a1lo = f32x4_add(clo, a1lo); + a1hi = f32x4_add(chi, a1hi); + a2lo = f32x4_add(clo, a2lo); + a2hi = f32x4_add(chi, a2hi); + a3lo = f32x4_add(clo, a3lo); + a3hi = f32x4_add(chi, a3hi); + a4lo = f32x4_add(clo, a4lo); + a4hi = f32x4_add(chi, a4hi); + a5lo = f32x4_add(clo, a5lo); + a5hi = f32x4_add(chi, a5hi); + a6lo = f32x4_add(clo, a6lo); + a6hi = f32x4_add(chi, a6hi); + a7lo = f32x4_add(clo, a7lo); + a7hi = f32x4_add(chi, a7hi); + } + FusedKerSpec::PerColMul(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_mul(clo, a0lo); + a0hi = f32x4_mul(chi, a0hi); + a1lo = f32x4_mul(clo, a1lo); + a1hi = f32x4_mul(chi, a1hi); + a2lo = f32x4_mul(clo, a2lo); + a2hi = f32x4_mul(chi, a2hi); + a3lo = f32x4_mul(clo, a3lo); + a3hi = f32x4_mul(chi, a3hi); + a4lo = f32x4_mul(clo, a4lo); + a4hi = f32x4_mul(chi, a4hi); + a5lo = f32x4_mul(clo, a5lo); + 
a5hi = f32x4_mul(chi, a5hi); + a6lo = f32x4_mul(clo, a6lo); + a6hi = f32x4_mul(chi, a6hi); + a7lo = f32x4_mul(clo, a7lo); + a7hi = f32x4_mul(chi, a7hi); + } + FusedKerSpec::PerColSub(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_sub(clo, a0lo); + a0hi = f32x4_sub(chi, a0hi); + a1lo = f32x4_sub(clo, a1lo); + a1hi = f32x4_sub(chi, a1hi); + a2lo = f32x4_sub(clo, a2lo); + a2hi = f32x4_sub(chi, a2hi); + a3lo = f32x4_sub(clo, a3lo); + a3hi = f32x4_sub(chi, a3hi); + a4lo = f32x4_sub(clo, a4lo); + a4hi = f32x4_sub(chi, a4hi); + a5lo = f32x4_sub(clo, a5lo); + a5hi = f32x4_sub(chi, a5hi); + a6lo = f32x4_sub(clo, a6lo); + a6hi = f32x4_sub(chi, a6hi); + a7lo = f32x4_sub(clo, a7lo); + a7hi = f32x4_sub(chi, a7hi); + } + FusedKerSpec::PerColSubF(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_sub(a0lo, clo); + a0hi = f32x4_sub(a0hi, chi); + a1lo = f32x4_sub(a1lo, clo); + a1hi = f32x4_sub(a1hi, chi); + a2lo = f32x4_sub(a2lo, clo); + a2hi = f32x4_sub(a2hi, chi); + a3lo = f32x4_sub(a3lo, clo); + a3hi = f32x4_sub(a3hi, chi); + a4lo = f32x4_sub(a4lo, clo); + a4hi = f32x4_sub(a4hi, chi); + a5lo = f32x4_sub(a5lo, clo); + a5hi = f32x4_sub(a5hi, chi); + a6lo = f32x4_sub(a6lo, clo); + a6hi = f32x4_sub(a6hi, chi); + a7lo = f32x4_sub(a7lo, clo); + a7hi = f32x4_sub(a7hi, chi); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = 
f32x4_mul(s, a7hi); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::AddUnicast(tile) => { + // 8 rows × 8 cols, each row laid out per col_byte_stride + let mut ptr: *const u8 = tile.ptr; + for ab_pair in [ + (&mut a0lo, &mut a0hi), + (&mut a1lo, &mut a1hi), + (&mut a2lo, &mut a2hi), + (&mut a3lo, &mut a3hi), + (&mut a4lo, &mut a4hi), + (&mut a5lo, &mut a5hi), + (&mut a6lo, &mut a6hi), + (&mut a7lo, &mut a7hi), + ] + .iter_mut() + { + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32); + let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32); + let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32); + let m7 = 
*(ptr.offset(tile.col_byte_stride * 7) as *const f32); + let (lo, hi) = ab_pair; + **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3)); + **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7)); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + let r0 = f32x4_splat(*rows.add(0)); + a0lo = f32x4_add(a0lo, f32x4_mul(r0, clo)); + a0hi = f32x4_add(a0hi, f32x4_mul(r0, chi)); + let r1 = f32x4_splat(*rows.add(1)); + a1lo = f32x4_add(a1lo, f32x4_mul(r1, clo)); + a1hi = f32x4_add(a1hi, f32x4_mul(r1, chi)); + let r2 = f32x4_splat(*rows.add(2)); + a2lo = f32x4_add(a2lo, f32x4_mul(r2, clo)); + a2hi = f32x4_add(a2hi, f32x4_mul(r2, chi)); + let r3 = f32x4_splat(*rows.add(3)); + a3lo = f32x4_add(a3lo, f32x4_mul(r3, clo)); + a3hi = f32x4_add(a3hi, f32x4_mul(r3, chi)); + let r4 = f32x4_splat(*rows.add(4)); + a4lo = f32x4_add(a4lo, f32x4_mul(r4, clo)); + a4hi = f32x4_add(a4hi, f32x4_mul(r4, chi)); + let r5 = f32x4_splat(*rows.add(5)); + a5lo = f32x4_add(a5lo, f32x4_mul(r5, clo)); + a5hi = f32x4_add(a5hi, f32x4_mul(r5, chi)); + let r6 = f32x4_splat(*rows.add(6)); + a6lo = f32x4_add(a6lo, f32x4_mul(r6, clo)); + a6hi = f32x4_add(a6hi, f32x4_mul(r6, chi)); + let r7 = f32x4_splat(*rows.add(7)); + a7lo = f32x4_add(a7lo, f32x4_mul(r7, clo)); + a7hi = f32x4_add(a7hi, f32x4_mul(r7, chi)); + } + FusedKerSpec::Store(tile) => { + // 8 rows × 8 cols stores + let mut ptr: *mut u8 = tile.ptr; + for (lo, hi) in [ + (a0lo, a0hi), + (a1lo, a1hi), + (a2lo, a2hi), + (a3lo, a3hi), + (a4lo, a4hi), + (a5lo, a5hi), + (a6lo, a6hi), + (a7lo, a7hi), + ] + .iter() + { + *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = + f32x4_extract_lane::<1>(*lo); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(*lo); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(*lo); + 
*(ptr.offset(tile.col_byte_stride * 4) as *mut f32) = + f32x4_extract_lane::<0>(*hi); + *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) = + f32x4_extract_lane::<1>(*hi); + *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) = + f32x4_extract_lane::<2>(*hi); + *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) = + f32x4_extract_lane::<3>(*hi); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=8] = each k iter loads 8 row values + // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128 + let a = pa as *const f32; + let b = pb as *const v128; + for i in 0..k { + let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8); + let blo = v128_load(b.offset((2 * i) as isize)); + let bhi = v128_load(b.offset((2 * i + 1) as isize)); + let s = f32x4_splat(arow[0]); + a0lo = f32x4_add(a0lo, f32x4_mul(s, blo)); + a0hi = f32x4_add(a0hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[1]); + a1lo = f32x4_add(a1lo, f32x4_mul(s, blo)); + a1hi = f32x4_add(a1hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[2]); + a2lo = f32x4_add(a2lo, f32x4_mul(s, blo)); + a2hi = f32x4_add(a2hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[3]); + a3lo = f32x4_add(a3lo, f32x4_mul(s, blo)); + a3hi = f32x4_add(a3hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[4]); + a4lo = f32x4_add(a4lo, f32x4_mul(s, blo)); + a4hi = f32x4_add(a4hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[5]); + a5lo = f32x4_add(a5lo, f32x4_mul(s, blo)); + a5hi = f32x4_add(a5hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[6]); + a6lo = f32x4_add(a6lo, f32x4_mul(s, blo)); + a6hi = f32x4_add(a6hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[7]); + a7lo = f32x4_add(a7lo, f32x4_mul(s, blo)); + a7hi = f32x4_add(a7hi, f32x4_mul(s, bhi)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8(8,8)@(8,8) quality(ImplementationQuality::TargetOptimized)); diff --git 
a/vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma b/vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma new file mode 100644 index 000000000..628fc720c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma @@ -0,0 +1,1664 @@ +/// Wasm SIMD implementation of `MatMatMulKer` +/// +/// To run the tests, you need to install `wasmtime` +/// and export the following environment variables: +/// ``` +/// > export RUSTFLAGS='-C target-feature=+simd128' +/// > export CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime +/// > cargo test --target=wasm32-wasi +/// ``` +use crate::mmm::FusedKerSpec; +use crate::mmm::ImplementationQuality; +use crate::{Ops, Scaler}; + +pub fn plug(ops: &mut Ops) { + ops.mmm_impls.push(wasm_f32_4x4.mmm()); + ops.mmm_impls.push(wasm_f32_4x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x1.mmm()); + ops.mmm_impls.push(wasm_f32_16x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x8.mmm()); + // Kernel selection is hard-wired below rather than derived from mr/nr: + // - mmm_f32: always the 8x8 kernel; m, k and n are ignored + // - mmv_f32: picked by m — 0..=7 (or m missing) 4x1, 8..=15 8x1, 16 and up 16x1 + ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm()); + ops.mmv_f32 = Box::new(|m, _k| match m.unwrap_or(0) { + 0..=7 => wasm_f32_4x1.mmm(), + 8..=15 => wasm_f32_8x1.mmm(), + _ => wasm_f32_16x1.mmm(), + }); +} + +unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Each of these variables stores a row of the matrix, + // consisting of four packed `f32` numbers. 
+ let mut ab0 = f32x4_splat(0.0); + let mut ab1 = f32x4_splat(0.0); + let mut ab2 = f32x4_splat(0.0); + let mut ab3 = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let a = f32x4_splat(0.0); + ab0 = a; + ab1 = a; + ab2 = a; + ab3 = a; + } + FusedKerSpec::LoadTile(_cols, rows) => { + let rows = rows as *const v128; + ab0 = *rows; + ab1 = *rows.add(1); + ab2 = *rows.add(2); + ab3 = *rows.add(3); + } + FusedKerSpec::ScalarMin(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_min(a, ab0); + ab1 = f32x4_min(a, ab1); + ab2 = f32x4_min(a, ab2); + ab3 = f32x4_min(a, ab3); + } + FusedKerSpec::ScalarMax(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_max(a, ab0); + ab1 = f32x4_max(a, ab1); + ab2 = f32x4_max(a, ab2); + ab3 = f32x4_max(a, ab3); + } + FusedKerSpec::ScalarAdd(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_add(a, ab0); + ab1 = f32x4_add(a, ab1); + ab2 = f32x4_add(a, ab2); + ab3 = f32x4_add(a, ab3); + } + FusedKerSpec::ScalarMul(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_mul(a, ab0); + ab1 = f32x4_mul(a, ab1); + ab2 = f32x4_mul(a, ab2); + ab3 = f32x4_mul(a, ab3); + } + FusedKerSpec::ScalarSub(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_sub(a, ab0); + ab1 = f32x4_sub(a, ab1); + ab2 = f32x4_sub(a, ab2); + ab3 = f32x4_sub(a, ab3); + } + FusedKerSpec::ScalarSubF(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_sub(ab0, a); + ab1 = f32x4_sub(ab1, a); + ab2 = f32x4_sub(ab2, a); + ab3 = f32x4_sub(ab3, a); + } + FusedKerSpec::LeakyRelu(a) => { + let a = f32x4_splat(a); + let zero = f32x4_splat(0.0); + + let mask0 = f32x4_gt(ab0, zero); + ab0 = v128_bitselect(ab0, f32x4_mul(a, ab0), mask0); + + let mask1 = f32x4_gt(ab1, zero); + ab1 = v128_bitselect(ab1, f32x4_mul(a, ab1), mask1); + + let mask2 = f32x4_gt(ab2, zero); + ab2 = v128_bitselect(ab2, f32x4_mul(a, ab2), mask2); + + let mask3 = f32x4_gt(ab3, zero); + ab3 = v128_bitselect(ab3, f32x4_mul(a, ab3), mask3); + } + 
FusedKerSpec::PerRowMin(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_min(f32x4_splat(row[0]), ab0); + ab1 = f32x4_min(f32x4_splat(row[1]), ab1); + ab2 = f32x4_min(f32x4_splat(row[2]), ab2); + ab3 = f32x4_min(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowMax(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_max(f32x4_splat(row[0]), ab0); + ab1 = f32x4_max(f32x4_splat(row[1]), ab1); + ab2 = f32x4_max(f32x4_splat(row[2]), ab2); + ab3 = f32x4_max(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowAdd(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_add(f32x4_splat(row[0]), ab0); + ab1 = f32x4_add(f32x4_splat(row[1]), ab1); + ab2 = f32x4_add(f32x4_splat(row[2]), ab2); + ab3 = f32x4_add(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowMul(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_mul(f32x4_splat(row[0]), ab0); + ab1 = f32x4_mul(f32x4_splat(row[1]), ab1); + ab2 = f32x4_mul(f32x4_splat(row[2]), ab2); + ab3 = f32x4_mul(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowSub(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_sub(f32x4_splat(row[0]), ab0); + ab1 = f32x4_sub(f32x4_splat(row[1]), ab1); + ab2 = f32x4_sub(f32x4_splat(row[2]), ab2); + ab3 = f32x4_sub(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowSubF(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_sub(ab0, f32x4_splat(row[0])); + ab1 = f32x4_sub(ab1, f32x4_splat(row[1])); + ab2 = f32x4_sub(ab2, f32x4_splat(row[2])); + ab3 = f32x4_sub(ab3, f32x4_splat(row[3])); + } + FusedKerSpec::PerColMin(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_min(cols, ab0); + ab1 = f32x4_min(cols, ab1); + ab2 = f32x4_min(cols, ab2); + ab3 = f32x4_min(cols, ab3); + } + FusedKerSpec::PerColMax(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_max(cols, ab0); + ab1 = f32x4_max(cols, ab1); + ab2 = f32x4_max(cols, ab2); + ab3 = 
f32x4_max(cols, ab3); + } + FusedKerSpec::PerColAdd(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_add(cols, ab0); + ab1 = f32x4_add(cols, ab1); + ab2 = f32x4_add(cols, ab2); + ab3 = f32x4_add(cols, ab3); + } + FusedKerSpec::PerColMul(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_mul(cols, ab0); + ab1 = f32x4_mul(cols, ab1); + ab2 = f32x4_mul(cols, ab2); + ab3 = f32x4_mul(cols, ab3); + } + FusedKerSpec::PerColSub(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_sub(cols, ab0); + ab1 = f32x4_sub(cols, ab1); + ab2 = f32x4_sub(cols, ab2); + ab3 = f32x4_sub(cols, ab3); + } + FusedKerSpec::PerColSubF(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_sub(ab0, cols); + ab1 = f32x4_sub(ab1, cols); + ab2 = f32x4_sub(ab2, cols); + ab3 = f32x4_sub(ab3, cols); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let scale = f32x4_splat(scaler.scale); + ab0 = f32x4_mul(scale, ab0); + ab1 = f32x4_mul(scale, ab1); + ab2 = f32x4_mul(scale, ab2); + ab3 = f32x4_mul(scale, ab3); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let shift = f32x4_splat(2f32.powi(-(shift as i32))); + ab0 = f32x4_mul(shift, ab0); + ab1 = f32x4_mul(shift, ab1); + ab2 = f32x4_mul(shift, ab2); + ab3 = f32x4_mul(shift, ab3); + } + FusedKerSpec::ShiftLeft(shift) => { + let shift = f32x4_splat(2f32.powi(shift as i32)); + ab0 = f32x4_mul(shift, ab0); + ab1 = f32x4_mul(shift, ab1); + ab2 = f32x4_mul(shift, ab2); + ab3 = f32x4_mul(shift, ab3); + } + FusedKerSpec::AddUnicast(tile) => { + let mut ptr: *const u8 = tile.ptr; + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab0 = f32x4_add(ab0, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as 
*const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab1 = f32x4_add(ab1, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab2 = f32x4_add(ab2, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab3 = f32x4_add(ab3, f32x4(m0, m1, m2, m3)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols)); + ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols)); + ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols)); + ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols)); + } + FusedKerSpec::Store(tile) => { + let mut ptr: *mut u8 = tile.ptr; + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab0); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab0); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab0); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab0); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab1); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab1); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab1); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + 
f32x4_extract_lane::<3>(ab1); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab2); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab2); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab2); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab2); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab3); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab3); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(ab3); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(ab3); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + let a = pa as *const f32; + let b = pb as *const v128; + for i in 0..k { + let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4); + let b = v128_load(b.offset(i as isize)); + ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b)); + ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b)); + ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b)); + ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products +/// (single-column outputs, e.g., streaming-RNN inference where each frame's +/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec +/// match arms but collapses the column dimension from 4 to 1: a single +/// f32x4 accumulator holds 4 output rows × 1 output column packed as +/// [ab[0], ab[1], ab[2], ab[3]]. +/// +/// Selection: tract-core's einsum kernel_selection::strategize() prefers +/// kernels with nr() == 1 when op.n.is_one(), so this kernel is +/// automatically picked for N=1 cases once registered. 
+unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Single accumulator: 4 rows × 1 col, packed into one f32x4. + // lane[i] holds ab[i] = the output value for row i (col 0). + let mut ab = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + ab = f32x4_splat(0.0); + } + FusedKerSpec::LoadTile(_cols, rows) => { + // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128 + ab = v128_load(rows as *const v128); + } + FusedKerSpec::ScalarMin(a) => { + ab = f32x4_min(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarMax(a) => { + ab = f32x4_max(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarAdd(a) => { + ab = f32x4_add(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarMul(a) => { + ab = f32x4_mul(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarSub(a) => { + ab = f32x4_sub(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarSubF(a) => { + ab = f32x4_sub(ab, f32x4_splat(a)); + } + FusedKerSpec::LeakyRelu(a) => { + let zero = f32x4_splat(0.0); + let mask = f32x4_gt(ab, zero); + ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask); + } + FusedKerSpec::PerRowMin(row) => { + // 4 row values, applied to ab's 4 lanes in order + let r = v128_load(row as *const v128); + ab = f32x4_min(r, ab); + } + FusedKerSpec::PerRowMax(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_max(r, ab); + } + FusedKerSpec::PerRowAdd(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_add(r, ab); + } + FusedKerSpec::PerRowMul(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_mul(r, ab); + } + FusedKerSpec::PerRowSub(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_sub(r, ab); + } + FusedKerSpec::PerRowSubF(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_sub(ab, r); + } + FusedKerSpec::PerColMin(cols) => { + // Single col value broadcast to all 4 rows + ab = f32x4_min(f32x4_splat(*cols), ab); + } + 
FusedKerSpec::PerColMax(cols) => { + ab = f32x4_max(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColAdd(cols) => { + ab = f32x4_add(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColMul(cols) => { + ab = f32x4_mul(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColSub(cols) => { + ab = f32x4_sub(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColSubF(cols) => { + ab = f32x4_sub(ab, f32x4_splat(*cols)); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + ab = f32x4_mul(f32x4_splat(scaler.scale), ab); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab = f32x4_mul(s, ab); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab = f32x4_mul(s, ab); + } + FusedKerSpec::AddUnicast(tile) => { + // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1) + let mut ptr: *const u8 = tile.ptr; + let m0 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m1 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m2 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m3 = *(ptr as *const f32); + ab = f32x4_add(ab, f32x4(m0, m1, m2, m3)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + // ab[i] += rows[i] * cols[0] (cols[0] is the single col) + let r = v128_load(rows as *const v128); + let c = f32x4_splat(*cols); + ab = f32x4_add(ab, f32x4_mul(r, c)); + } + FusedKerSpec::Store(tile) => { + // 4 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = 
f32x4_extract_lane::<3>(ab); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128. + // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast. + // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a_vec = v128_load(a.offset(i as isize)); + let b_splat = f32x4_splat(*b.offset(i as isize)); + ab = f32x4_add(ab, f32x4_mul(a_vec, b_splat)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1(4,1)@(4,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products +/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top, +/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration: +/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per +/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the +/// hardware pipeline level. +/// +/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is +/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For +/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield +/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput +/// exceeds 1 op/cycle. +/// +/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1 +/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1 +/// ops once registered (including small-M cases where it slightly wastes +/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those +/// ops are <1% of frame so the regression is noise). 
+unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot] + // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4 + let mut ab_top = f32x4_splat(0.0); + let mut ab_bot = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + ab_top = f32x4_splat(0.0); + ab_bot = f32x4_splat(0.0); + } + FusedKerSpec::LoadTile(_cols, rows) => { + // 8 rows × 1 col = 8 contiguous f32 = 2 v128 + let p = rows as *const v128; + ab_top = *p; + ab_bot = *p.add(1); + } + FusedKerSpec::ScalarMin(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_min(s, ab_top); + ab_bot = f32x4_min(s, ab_bot); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_max(s, ab_top); + ab_bot = f32x4_max(s, ab_bot); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_add(s, ab_top); + ab_bot = f32x4_add(s, ab_bot); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_sub(s, ab_top); + ab_bot = f32x4_sub(s, ab_bot); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + ab_top = f32x4_sub(ab_top, s); + ab_bot = f32x4_sub(ab_bot, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let mask_t = f32x4_gt(ab_top, zero); + let mask_b = f32x4_gt(ab_bot, zero); + ab_top = v128_bitselect(ab_top, f32x4_mul(s, ab_top), mask_t); + ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b); + } + FusedKerSpec::PerRowMin(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_min(r_t, ab_top); + ab_bot = f32x4_min(r_b, ab_bot); + } + FusedKerSpec::PerRowMax(row) => { + let p = row as *const v128; + 
let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_max(r_t, ab_top); + ab_bot = f32x4_max(r_b, ab_bot); + } + FusedKerSpec::PerRowAdd(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_add(r_t, ab_top); + ab_bot = f32x4_add(r_b, ab_bot); + } + FusedKerSpec::PerRowMul(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_mul(r_t, ab_top); + ab_bot = f32x4_mul(r_b, ab_bot); + } + FusedKerSpec::PerRowSub(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_sub(r_t, ab_top); + ab_bot = f32x4_sub(r_b, ab_bot); + } + FusedKerSpec::PerRowSubF(row) => { + let p = row as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + ab_top = f32x4_sub(ab_top, r_t); + ab_bot = f32x4_sub(ab_bot, r_b); + } + FusedKerSpec::PerColMin(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_min(c, ab_top); + ab_bot = f32x4_min(c, ab_bot); + } + FusedKerSpec::PerColMax(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_max(c, ab_top); + ab_bot = f32x4_max(c, ab_bot); + } + FusedKerSpec::PerColAdd(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_add(c, ab_top); + ab_bot = f32x4_add(c, ab_bot); + } + FusedKerSpec::PerColMul(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_mul(c, ab_top); + ab_bot = f32x4_mul(c, ab_bot); + } + FusedKerSpec::PerColSub(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_sub(c, ab_top); + ab_bot = f32x4_sub(c, ab_bot); + } + FusedKerSpec::PerColSubF(cols) => { + let c = f32x4_splat(*cols); + ab_top = f32x4_sub(ab_top, c); + ab_bot = f32x4_sub(ab_bot, c); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + 
FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab_top = f32x4_mul(s, ab_top); + ab_bot = f32x4_mul(s, ab_bot); + } + FusedKerSpec::AddUnicast(tile) => { + // 8 rows × 1 col, stride is row_byte_stride between rows + let mut ptr: *const u8 = tile.ptr; + let m0 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m1 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m2 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m3 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m4 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m5 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m6 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m7 = *(ptr as *const f32); + ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3)); + ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = rows as *const v128; + let r_t = v128_load(p); + let r_b = v128_load(p.add(1)); + let c = f32x4_splat(*cols); + ab_top = f32x4_add(ab_top, f32x4_mul(r_t, c)); + ab_bot = f32x4_add(ab_bot, f32x4_mul(r_b, c)); + } + FusedKerSpec::Store(tile) => { + // 8 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut 
f32) = f32x4_extract_lane::<0>(ab_bot); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128 + // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast + // The two fmadd ops on (ab_top, ab_bot) are independent — 2-way ILP per iter. + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a_t = v128_load(a.offset((2 * i) as isize)); + let a_b = v128_load(a.offset((2 * i + 1) as isize)); + let b_splat = f32x4_splat(*b.offset(i as isize)); + ab_top = f32x4_add(ab_top, f32x4_mul(a_t, b_splat)); + ab_bot = f32x4_add(ab_bot, f32x4_mul(a_b, b_splat)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1(8,1)@(8,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products +/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3, +/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration. +/// +/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more +/// parallel work to the SIMD pipelines, beneficial on hardware with 3+ +/// SIMD execution units (most modern ARM and x86). 
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3] + // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15 + let mut ab_q0 = f32x4_splat(0.0); + let mut ab_q1 = f32x4_splat(0.0); + let mut ab_q2 = f32x4_splat(0.0); + let mut ab_q3 = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let z = f32x4_splat(0.0); + ab_q0 = z; + ab_q1 = z; + ab_q2 = z; + ab_q3 = z; + } + FusedKerSpec::LoadTile(_cols, rows) => { + let p = rows as *const v128; + ab_q0 = *p; + ab_q1 = *p.add(1); + ab_q2 = *p.add(2); + ab_q3 = *p.add(3); + } + FusedKerSpec::ScalarMin(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_min(s, ab_q0); + ab_q1 = f32x4_min(s, ab_q1); + ab_q2 = f32x4_min(s, ab_q2); + ab_q3 = f32x4_min(s, ab_q3); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_max(s, ab_q0); + ab_q1 = f32x4_max(s, ab_q1); + ab_q2 = f32x4_max(s, ab_q2); + ab_q3 = f32x4_max(s, ab_q3); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_add(s, ab_q0); + ab_q1 = f32x4_add(s, ab_q1); + ab_q2 = f32x4_add(s, ab_q2); + ab_q3 = f32x4_add(s, ab_q3); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_sub(s, ab_q0); + ab_q1 = f32x4_sub(s, ab_q1); + ab_q2 = f32x4_sub(s, ab_q2); + ab_q3 = f32x4_sub(s, ab_q3); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + ab_q0 = f32x4_sub(ab_q0, s); + ab_q1 = f32x4_sub(ab_q1, s); + ab_q2 = f32x4_sub(ab_q2, s); + ab_q3 = f32x4_sub(ab_q3, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let m0 = f32x4_gt(ab_q0, 
zero); + ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0); + let m1 = f32x4_gt(ab_q1, zero); + ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1); + let m2 = f32x4_gt(ab_q2, zero); + ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2); + let m3 = f32x4_gt(ab_q3, zero); + ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3); + } + FusedKerSpec::PerRowMin(row) => { + let p = row as *const v128; + ab_q0 = f32x4_min(v128_load(p), ab_q0); + ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowMax(row) => { + let p = row as *const v128; + ab_q0 = f32x4_max(v128_load(p), ab_q0); + ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowAdd(row) => { + let p = row as *const v128; + ab_q0 = f32x4_add(v128_load(p), ab_q0); + ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowMul(row) => { + let p = row as *const v128; + ab_q0 = f32x4_mul(v128_load(p), ab_q0); + ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowSub(row) => { + let p = row as *const v128; + ab_q0 = f32x4_sub(v128_load(p), ab_q0); + ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1); + ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2); + ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3); + } + FusedKerSpec::PerRowSubF(row) => { + let p = row as *const v128; + ab_q0 = f32x4_sub(ab_q0, v128_load(p)); + ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1))); + ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2))); + ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3))); + } + FusedKerSpec::PerColMin(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_min(c, ab_q0); + 
ab_q1 = f32x4_min(c, ab_q1); + ab_q2 = f32x4_min(c, ab_q2); + ab_q3 = f32x4_min(c, ab_q3); + } + FusedKerSpec::PerColMax(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_max(c, ab_q0); + ab_q1 = f32x4_max(c, ab_q1); + ab_q2 = f32x4_max(c, ab_q2); + ab_q3 = f32x4_max(c, ab_q3); + } + FusedKerSpec::PerColAdd(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_add(c, ab_q0); + ab_q1 = f32x4_add(c, ab_q1); + ab_q2 = f32x4_add(c, ab_q2); + ab_q3 = f32x4_add(c, ab_q3); + } + FusedKerSpec::PerColMul(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_mul(c, ab_q0); + ab_q1 = f32x4_mul(c, ab_q1); + ab_q2 = f32x4_mul(c, ab_q2); + ab_q3 = f32x4_mul(c, ab_q3); + } + FusedKerSpec::PerColSub(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_sub(c, ab_q0); + ab_q1 = f32x4_sub(c, ab_q1); + ab_q2 = f32x4_sub(c, ab_q2); + ab_q3 = f32x4_sub(c, ab_q3); + } + FusedKerSpec::PerColSubF(cols) => { + let c = f32x4_splat(*cols); + ab_q0 = f32x4_sub(ab_q0, c); + ab_q1 = f32x4_sub(ab_q1, c); + ab_q2 = f32x4_sub(ab_q2, c); + ab_q3 = f32x4_sub(ab_q3, c); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab_q0 = f32x4_mul(s, ab_q0); + ab_q1 = f32x4_mul(s, ab_q1); + ab_q2 = f32x4_mul(s, ab_q2); + ab_q3 = f32x4_mul(s, ab_q3); + } + FusedKerSpec::AddUnicast(tile) => { + // 16 rows × 1 col, with row_byte_stride between rows + let mut ptr: *const u8 = tile.ptr; + let mut ms = [0f32; 16]; + for i in 0..16 { + ms[i] = *(ptr as *const f32); + ptr 
= ptr.add(tile.row_byte_stride as usize); + } + ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3])); + ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7])); + ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11])); + ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15])); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = rows as *const v128; + let c = f32x4_splat(*cols); + ab_q0 = f32x4_add(ab_q0, f32x4_mul(v128_load(p), c)); + ab_q1 = f32x4_add(ab_q1, f32x4_mul(v128_load(p.add(1)), c)); + ab_q2 = f32x4_add(ab_q2, f32x4_mul(v128_load(p.add(2)), c)); + ab_q3 = f32x4_add(ab_q3, f32x4_mul(v128_load(p.add(3)), c)); + } + FusedKerSpec::Store(tile) => { + // 16 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() { + *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128 + // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast + // 4 INDEPENDENT fmadds per k-iter — 4-way ILP + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a0 = v128_load(a.offset((4 * i) as isize)); + let a1 = v128_load(a.offset((4 * i + 1) as isize)); + let a2 = v128_load(a.offset((4 * i + 2) as isize)); + let a3 = v128_load(a.offset((4 * i + 3) as isize)); + let bs = f32x4_splat(*b.offset(i as isize)); + ab_q0 = f32x4_add(ab_q0, f32x4_mul(a0, bs)); + ab_q1 = f32x4_add(ab_q1, f32x4_mul(a1, bs)); + ab_q2 = f32x4_add(ab_q2, f32x4_mul(a2, bs)); + ab_q3 = f32x4_add(ab_q3, 
f32x4_mul(a3, bs)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1(16,1)@(16,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators). +/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. 16 accumulators +/// is at the limit of WASM's 16 logical SIMD register slots; this tests the +/// register-pressure boundary. For DFN3 ops, all M and N are multiples of 8, +/// so 8x8 fits cleanly with no padding waste. +unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + unsafe { + // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi) + let mut a0lo = f32x4_splat(0.0); + let mut a0hi = f32x4_splat(0.0); + let mut a1lo = f32x4_splat(0.0); + let mut a1hi = f32x4_splat(0.0); + let mut a2lo = f32x4_splat(0.0); + let mut a2hi = f32x4_splat(0.0); + let mut a3lo = f32x4_splat(0.0); + let mut a3hi = f32x4_splat(0.0); + let mut a4lo = f32x4_splat(0.0); + let mut a4hi = f32x4_splat(0.0); + let mut a5lo = f32x4_splat(0.0); + let mut a5hi = f32x4_splat(0.0); + let mut a6lo = f32x4_splat(0.0); + let mut a6hi = f32x4_splat(0.0); + let mut a7lo = f32x4_splat(0.0); + let mut a7hi = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let z = f32x4_splat(0.0); + a0lo = z; + a0hi = z; + a1lo = z; + a1hi = z; + a2lo = z; + a2hi = z; + a3lo = z; + a3hi = z; + a4lo = z; + a4hi = z; + a5lo = z; + a5hi = z; + a6lo = z; + a6hi = z; + a7lo = z; + a7hi = z; + } + FusedKerSpec::LoadTile(_cols, rows) => { + // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi) + let p = rows as *const v128; + a0lo = *p.add(0); + a0hi = *p.add(1); + a1lo = *p.add(2); + a1hi = *p.add(3); + a2lo = *p.add(4); + a2hi = *p.add(5); + a3lo = *p.add(6); + a3hi = *p.add(7); + a4lo = *p.add(8); + a4hi = *p.add(9); + a5lo = *p.add(10); + a5hi = 
*p.add(11); + a6lo = *p.add(12); + a6hi = *p.add(13); + a7lo = *p.add(14); + a7hi = *p.add(15); + } + FusedKerSpec::ScalarMin(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_min(s, a0lo); + a0hi = f32x4_min(s, a0hi); + a1lo = f32x4_min(s, a1lo); + a1hi = f32x4_min(s, a1hi); + a2lo = f32x4_min(s, a2lo); + a2hi = f32x4_min(s, a2hi); + a3lo = f32x4_min(s, a3lo); + a3hi = f32x4_min(s, a3hi); + a4lo = f32x4_min(s, a4lo); + a4hi = f32x4_min(s, a4hi); + a5lo = f32x4_min(s, a5lo); + a5hi = f32x4_min(s, a5hi); + a6lo = f32x4_min(s, a6lo); + a6hi = f32x4_min(s, a6hi); + a7lo = f32x4_min(s, a7lo); + a7hi = f32x4_min(s, a7hi); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_max(s, a0lo); + a0hi = f32x4_max(s, a0hi); + a1lo = f32x4_max(s, a1lo); + a1hi = f32x4_max(s, a1hi); + a2lo = f32x4_max(s, a2lo); + a2hi = f32x4_max(s, a2hi); + a3lo = f32x4_max(s, a3lo); + a3hi = f32x4_max(s, a3hi); + a4lo = f32x4_max(s, a4lo); + a4hi = f32x4_max(s, a4hi); + a5lo = f32x4_max(s, a5lo); + a5hi = f32x4_max(s, a5hi); + a6lo = f32x4_max(s, a6lo); + a6hi = f32x4_max(s, a6hi); + a7lo = f32x4_max(s, a7lo); + a7hi = f32x4_max(s, a7hi); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_add(s, a0lo); + a0hi = f32x4_add(s, a0hi); + a1lo = f32x4_add(s, a1lo); + a1hi = f32x4_add(s, a1hi); + a2lo = f32x4_add(s, a2lo); + a2hi = f32x4_add(s, a2hi); + a3lo = f32x4_add(s, a3lo); + a3hi = f32x4_add(s, a3hi); + a4lo = f32x4_add(s, a4lo); + a4hi = f32x4_add(s, a4hi); + a5lo = f32x4_add(s, a5lo); + a5hi = f32x4_add(s, a5hi); + a6lo = f32x4_add(s, a6lo); + a6hi = f32x4_add(s, a6hi); + a7lo = f32x4_add(s, a7lo); + a7hi = f32x4_add(s, a7hi); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, 
a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_sub(s, a0lo); + a0hi = f32x4_sub(s, a0hi); + a1lo = f32x4_sub(s, a1lo); + a1hi = f32x4_sub(s, a1hi); + a2lo = f32x4_sub(s, a2lo); + a2hi = f32x4_sub(s, a2hi); + a3lo = f32x4_sub(s, a3lo); + a3hi = f32x4_sub(s, a3hi); + a4lo = f32x4_sub(s, a4lo); + a4hi = f32x4_sub(s, a4hi); + a5lo = f32x4_sub(s, a5lo); + a5hi = f32x4_sub(s, a5hi); + a6lo = f32x4_sub(s, a6lo); + a6hi = f32x4_sub(s, a6hi); + a7lo = f32x4_sub(s, a7lo); + a7hi = f32x4_sub(s, a7hi); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_sub(a0lo, s); + a0hi = f32x4_sub(a0hi, s); + a1lo = f32x4_sub(a1lo, s); + a1hi = f32x4_sub(a1hi, s); + a2lo = f32x4_sub(a2lo, s); + a2hi = f32x4_sub(a2hi, s); + a3lo = f32x4_sub(a3lo, s); + a3hi = f32x4_sub(a3hi, s); + a4lo = f32x4_sub(a4lo, s); + a4hi = f32x4_sub(a4hi, s); + a5lo = f32x4_sub(a5lo, s); + a5hi = f32x4_sub(a5hi, s); + a6lo = f32x4_sub(a6lo, s); + a6hi = f32x4_sub(a6hi, s); + a7lo = f32x4_sub(a7lo, s); + a7hi = f32x4_sub(a7hi, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let m0a = f32x4_gt(a0lo, zero); + a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a); + let m0b = f32x4_gt(a0hi, zero); + a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b); + let m1a = f32x4_gt(a1lo, zero); + a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a); + let m1b = f32x4_gt(a1hi, zero); + a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b); + let m2a = f32x4_gt(a2lo, zero); + a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a); + let m2b = f32x4_gt(a2hi, zero); + a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b); + let m3a = f32x4_gt(a3lo, zero); + a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a); + let 
m3b = f32x4_gt(a3hi, zero); + a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b); + let m4a = f32x4_gt(a4lo, zero); + a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a); + let m4b = f32x4_gt(a4hi, zero); + a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b); + let m5a = f32x4_gt(a5lo, zero); + a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a); + let m5b = f32x4_gt(a5hi, zero); + a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b); + let m6a = f32x4_gt(a6lo, zero); + a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a); + let m6b = f32x4_gt(a6hi, zero); + a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b); + let m7a = f32x4_gt(a7lo, zero); + a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a); + let m7b = f32x4_gt(a7hi, zero); + a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b); + } + FusedKerSpec::PerRowMin(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_min(r0, a0lo); + a0hi = f32x4_min(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_min(r1, a1lo); + a1hi = f32x4_min(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_min(r2, a2lo); + a2hi = f32x4_min(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_min(r3, a3lo); + a3hi = f32x4_min(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_min(r4, a4lo); + a4hi = f32x4_min(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_min(r5, a5lo); + a5hi = f32x4_min(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_min(r6, a6lo); + a6hi = f32x4_min(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_min(r7, a7lo); + a7hi = f32x4_min(r7, a7hi); + } + FusedKerSpec::PerRowMax(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_max(r0, a0lo); + a0hi = f32x4_max(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_max(r1, a1lo); + a1hi = f32x4_max(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_max(r2, a2lo); + a2hi = f32x4_max(r2, a2hi); + let r3 = f32x4_splat(r[3]); + 
a3lo = f32x4_max(r3, a3lo); + a3hi = f32x4_max(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_max(r4, a4lo); + a4hi = f32x4_max(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_max(r5, a5lo); + a5hi = f32x4_max(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_max(r6, a6lo); + a6hi = f32x4_max(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_max(r7, a7lo); + a7hi = f32x4_max(r7, a7hi); + } + FusedKerSpec::PerRowAdd(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_add(r0, a0lo); + a0hi = f32x4_add(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_add(r1, a1lo); + a1hi = f32x4_add(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_add(r2, a2lo); + a2hi = f32x4_add(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_add(r3, a3lo); + a3hi = f32x4_add(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_add(r4, a4lo); + a4hi = f32x4_add(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_add(r5, a5lo); + a5hi = f32x4_add(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_add(r6, a6lo); + a6hi = f32x4_add(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_add(r7, a7lo); + a7hi = f32x4_add(r7, a7hi); + } + FusedKerSpec::PerRowMul(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_mul(r0, a0lo); + a0hi = f32x4_mul(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_mul(r1, a1lo); + a1hi = f32x4_mul(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_mul(r2, a2lo); + a2hi = f32x4_mul(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_mul(r3, a3lo); + a3hi = f32x4_mul(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_mul(r4, a4lo); + a4hi = f32x4_mul(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_mul(r5, a5lo); + a5hi = f32x4_mul(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_mul(r6, a6lo); + a6hi = f32x4_mul(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_mul(r7, a7lo); 
+ a7hi = f32x4_mul(r7, a7hi); + } + FusedKerSpec::PerRowSub(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_sub(r0, a0lo); + a0hi = f32x4_sub(r0, a0hi); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_sub(r1, a1lo); + a1hi = f32x4_sub(r1, a1hi); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_sub(r2, a2lo); + a2hi = f32x4_sub(r2, a2hi); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_sub(r3, a3lo); + a3hi = f32x4_sub(r3, a3hi); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_sub(r4, a4lo); + a4hi = f32x4_sub(r4, a4hi); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_sub(r5, a5lo); + a5hi = f32x4_sub(r5, a5hi); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_sub(r6, a6lo); + a6hi = f32x4_sub(r6, a6hi); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_sub(r7, a7lo); + a7hi = f32x4_sub(r7, a7hi); + } + FusedKerSpec::PerRowSubF(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); + a0lo = f32x4_sub(a0lo, r0); + a0hi = f32x4_sub(a0hi, r0); + let r1 = f32x4_splat(r[1]); + a1lo = f32x4_sub(a1lo, r1); + a1hi = f32x4_sub(a1hi, r1); + let r2 = f32x4_splat(r[2]); + a2lo = f32x4_sub(a2lo, r2); + a2hi = f32x4_sub(a2hi, r2); + let r3 = f32x4_splat(r[3]); + a3lo = f32x4_sub(a3lo, r3); + a3hi = f32x4_sub(a3hi, r3); + let r4 = f32x4_splat(r[4]); + a4lo = f32x4_sub(a4lo, r4); + a4hi = f32x4_sub(a4hi, r4); + let r5 = f32x4_splat(r[5]); + a5lo = f32x4_sub(a5lo, r5); + a5hi = f32x4_sub(a5hi, r5); + let r6 = f32x4_splat(r[6]); + a6lo = f32x4_sub(a6lo, r6); + a6hi = f32x4_sub(a6hi, r6); + let r7 = f32x4_splat(r[7]); + a7lo = f32x4_sub(a7lo, r7); + a7hi = f32x4_sub(a7hi, r7); + } + FusedKerSpec::PerColMin(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_min(clo, a0lo); + a0hi = f32x4_min(chi, a0hi); + a1lo = f32x4_min(clo, a1lo); + a1hi = f32x4_min(chi, a1hi); + a2lo = f32x4_min(clo, a2lo); + a2hi = f32x4_min(chi, a2hi); + a3lo = f32x4_min(clo, a3lo); + a3hi = 
f32x4_min(chi, a3hi); + a4lo = f32x4_min(clo, a4lo); + a4hi = f32x4_min(chi, a4hi); + a5lo = f32x4_min(clo, a5lo); + a5hi = f32x4_min(chi, a5hi); + a6lo = f32x4_min(clo, a6lo); + a6hi = f32x4_min(chi, a6hi); + a7lo = f32x4_min(clo, a7lo); + a7hi = f32x4_min(chi, a7hi); + } + FusedKerSpec::PerColMax(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_max(clo, a0lo); + a0hi = f32x4_max(chi, a0hi); + a1lo = f32x4_max(clo, a1lo); + a1hi = f32x4_max(chi, a1hi); + a2lo = f32x4_max(clo, a2lo); + a2hi = f32x4_max(chi, a2hi); + a3lo = f32x4_max(clo, a3lo); + a3hi = f32x4_max(chi, a3hi); + a4lo = f32x4_max(clo, a4lo); + a4hi = f32x4_max(chi, a4hi); + a5lo = f32x4_max(clo, a5lo); + a5hi = f32x4_max(chi, a5hi); + a6lo = f32x4_max(clo, a6lo); + a6hi = f32x4_max(chi, a6hi); + a7lo = f32x4_max(clo, a7lo); + a7hi = f32x4_max(chi, a7hi); + } + FusedKerSpec::PerColAdd(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_add(clo, a0lo); + a0hi = f32x4_add(chi, a0hi); + a1lo = f32x4_add(clo, a1lo); + a1hi = f32x4_add(chi, a1hi); + a2lo = f32x4_add(clo, a2lo); + a2hi = f32x4_add(chi, a2hi); + a3lo = f32x4_add(clo, a3lo); + a3hi = f32x4_add(chi, a3hi); + a4lo = f32x4_add(clo, a4lo); + a4hi = f32x4_add(chi, a4hi); + a5lo = f32x4_add(clo, a5lo); + a5hi = f32x4_add(chi, a5hi); + a6lo = f32x4_add(clo, a6lo); + a6hi = f32x4_add(chi, a6hi); + a7lo = f32x4_add(clo, a7lo); + a7hi = f32x4_add(chi, a7hi); + } + FusedKerSpec::PerColMul(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_mul(clo, a0lo); + a0hi = f32x4_mul(chi, a0hi); + a1lo = f32x4_mul(clo, a1lo); + a1hi = f32x4_mul(chi, a1hi); + a2lo = f32x4_mul(clo, a2lo); + a2hi = f32x4_mul(chi, a2hi); + a3lo = f32x4_mul(clo, a3lo); + a3hi = f32x4_mul(chi, a3hi); + a4lo = f32x4_mul(clo, a4lo); + a4hi = f32x4_mul(chi, a4hi); + a5lo = f32x4_mul(clo, a5lo); + 
a5hi = f32x4_mul(chi, a5hi); + a6lo = f32x4_mul(clo, a6lo); + a6hi = f32x4_mul(chi, a6hi); + a7lo = f32x4_mul(clo, a7lo); + a7hi = f32x4_mul(chi, a7hi); + } + FusedKerSpec::PerColSub(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_sub(clo, a0lo); + a0hi = f32x4_sub(chi, a0hi); + a1lo = f32x4_sub(clo, a1lo); + a1hi = f32x4_sub(chi, a1hi); + a2lo = f32x4_sub(clo, a2lo); + a2hi = f32x4_sub(chi, a2hi); + a3lo = f32x4_sub(clo, a3lo); + a3hi = f32x4_sub(chi, a3hi); + a4lo = f32x4_sub(clo, a4lo); + a4hi = f32x4_sub(chi, a4hi); + a5lo = f32x4_sub(clo, a5lo); + a5hi = f32x4_sub(chi, a5hi); + a6lo = f32x4_sub(clo, a6lo); + a6hi = f32x4_sub(chi, a6hi); + a7lo = f32x4_sub(clo, a7lo); + a7hi = f32x4_sub(chi, a7hi); + } + FusedKerSpec::PerColSubF(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + a0lo = f32x4_sub(a0lo, clo); + a0hi = f32x4_sub(a0hi, chi); + a1lo = f32x4_sub(a1lo, clo); + a1hi = f32x4_sub(a1hi, chi); + a2lo = f32x4_sub(a2lo, clo); + a2hi = f32x4_sub(a2hi, chi); + a3lo = f32x4_sub(a3lo, clo); + a3hi = f32x4_sub(a3hi, chi); + a4lo = f32x4_sub(a4lo, clo); + a4hi = f32x4_sub(a4hi, chi); + a5lo = f32x4_sub(a5lo, clo); + a5hi = f32x4_sub(a5hi, chi); + a6lo = f32x4_sub(a6lo, clo); + a6hi = f32x4_sub(a6hi, chi); + a7lo = f32x4_sub(a7lo, clo); + a7hi = f32x4_sub(a7hi, chi); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = 
f32x4_mul(s, a7hi); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + a0lo = f32x4_mul(s, a0lo); + a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); + a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); + a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); + a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); + a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); + a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); + a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); + a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::AddUnicast(tile) => { + // 8 rows × 8 cols, each row laid out per col_byte_stride + let mut ptr: *const u8 = tile.ptr; + for ab_pair in [ + (&mut a0lo, &mut a0hi), + (&mut a1lo, &mut a1hi), + (&mut a2lo, &mut a2hi), + (&mut a3lo, &mut a3hi), + (&mut a4lo, &mut a4hi), + (&mut a5lo, &mut a5hi), + (&mut a6lo, &mut a6hi), + (&mut a7lo, &mut a7hi), + ] + .iter_mut() + { + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32); + let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32); + let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32); + let m7 = 
*(ptr.offset(tile.col_byte_stride * 7) as *const f32); + let (lo, hi) = ab_pair; + **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3)); + **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7)); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = cols as *const v128; + let clo = v128_load(p); + let chi = v128_load(p.add(1)); + let r0 = f32x4_splat(*rows.add(0)); + a0lo = f32x4_add(a0lo, f32x4_mul(r0, clo)); + a0hi = f32x4_add(a0hi, f32x4_mul(r0, chi)); + let r1 = f32x4_splat(*rows.add(1)); + a1lo = f32x4_add(a1lo, f32x4_mul(r1, clo)); + a1hi = f32x4_add(a1hi, f32x4_mul(r1, chi)); + let r2 = f32x4_splat(*rows.add(2)); + a2lo = f32x4_add(a2lo, f32x4_mul(r2, clo)); + a2hi = f32x4_add(a2hi, f32x4_mul(r2, chi)); + let r3 = f32x4_splat(*rows.add(3)); + a3lo = f32x4_add(a3lo, f32x4_mul(r3, clo)); + a3hi = f32x4_add(a3hi, f32x4_mul(r3, chi)); + let r4 = f32x4_splat(*rows.add(4)); + a4lo = f32x4_add(a4lo, f32x4_mul(r4, clo)); + a4hi = f32x4_add(a4hi, f32x4_mul(r4, chi)); + let r5 = f32x4_splat(*rows.add(5)); + a5lo = f32x4_add(a5lo, f32x4_mul(r5, clo)); + a5hi = f32x4_add(a5hi, f32x4_mul(r5, chi)); + let r6 = f32x4_splat(*rows.add(6)); + a6lo = f32x4_add(a6lo, f32x4_mul(r6, clo)); + a6hi = f32x4_add(a6hi, f32x4_mul(r6, chi)); + let r7 = f32x4_splat(*rows.add(7)); + a7lo = f32x4_add(a7lo, f32x4_mul(r7, clo)); + a7hi = f32x4_add(a7hi, f32x4_mul(r7, chi)); + } + FusedKerSpec::Store(tile) => { + // 8 rows × 8 cols stores + let mut ptr: *mut u8 = tile.ptr; + for (lo, hi) in [ + (a0lo, a0hi), + (a1lo, a1hi), + (a2lo, a2hi), + (a3lo, a3hi), + (a4lo, a4hi), + (a5lo, a5hi), + (a6lo, a6hi), + (a7lo, a7hi), + ] + .iter() + { + *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = + f32x4_extract_lane::<1>(*lo); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = + f32x4_extract_lane::<2>(*lo); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = + f32x4_extract_lane::<3>(*lo); + 
*(ptr.offset(tile.col_byte_stride * 4) as *mut f32) = + f32x4_extract_lane::<0>(*hi); + *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) = + f32x4_extract_lane::<1>(*hi); + *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) = + f32x4_extract_lane::<2>(*hi); + *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) = + f32x4_extract_lane::<3>(*hi); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=8] = each k iter loads 8 row values + // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128 + let a = pa as *const f32; + let b = pb as *const v128; + for i in 0..k { + let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8); + let blo = v128_load(b.offset((2 * i) as isize)); + let bhi = v128_load(b.offset((2 * i + 1) as isize)); + let s = f32x4_splat(arow[0]); + a0lo = f32x4_add(a0lo, f32x4_mul(s, blo)); + a0hi = f32x4_add(a0hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[1]); + a1lo = f32x4_add(a1lo, f32x4_mul(s, blo)); + a1hi = f32x4_add(a1hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[2]); + a2lo = f32x4_add(a2lo, f32x4_mul(s, blo)); + a2hi = f32x4_add(a2hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[3]); + a3lo = f32x4_add(a3lo, f32x4_mul(s, blo)); + a3hi = f32x4_add(a3hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[4]); + a4lo = f32x4_add(a4lo, f32x4_mul(s, blo)); + a4hi = f32x4_add(a4hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[5]); + a5lo = f32x4_add(a5lo, f32x4_mul(s, blo)); + a5hi = f32x4_add(a5hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[6]); + a6lo = f32x4_add(a6lo, f32x4_mul(s, blo)); + a6hi = f32x4_add(a6hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[7]); + a7lo = f32x4_add(a7lo, f32x4_mul(s, blo)); + a7hi = f32x4_add(a7hi, f32x4_mul(s, bhi)); + } + } + } + pnl = pnl.add(1); + } + 0 + } +} + +MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8(8,8)@(8,8) quality(ImplementationQuality::TargetOptimized)); diff --git 
a/vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4 b/vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4 new file mode 100644 index 000000000..6b4a25e85 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4 @@ -0,0 +1,1555 @@ +/// Wasm SIMD implementation of `MatMatMulKer` +/// +/// To run test, you need to install `wasmtime` +/// and export the following environment variables: +/// ``` +/// > export RUSTFLAGS='-C target-feature=+simd128' +/// > export CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime +/// > cargo test --target=wasm32-wasi +/// ``` +use crate::mmm::FusedKerSpec; +use crate::mmm::ImplementationQuality; +use crate::{Ops, Scaler}; + +pub fn plug(ops: &mut Ops) { + ops.mmm_impls.push(wasm_f32_4x4.mmm()); + ops.mmm_impls.push(wasm_f32_4x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x4.mmm()); + ops.mmm_impls.push(wasm_f32_16x1.mmm()); + ops.mmm_impls.push(wasm_f32_8x8.mmm()); + // Selection: max(nr*mr) for N>1, max(mr) for N=1. + // - N>1 ops: 8x8 (nr*mr=64) wins over 8x4 (32) and 4x4 (16) + // - N=1 ops: 16x1 (mr=16) wins + ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm()); + ops.mmv_f32 = Box::new(|m, _k| { + match m.unwrap_or(0) { + 0..=7 => wasm_f32_4x1.mmm(), + 8..=15 => wasm_f32_8x1.mmm(), + _ => wasm_f32_16x1.mmm(), + } + }); +} + +unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + // Each of these variables stores a row of the matrix, + // consisting of four packed `f32` numbers. 
+ let mut ab0 = f32x4_splat(0.0); + let mut ab1 = f32x4_splat(0.0); + let mut ab2 = f32x4_splat(0.0); + let mut ab3 = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let a = f32x4_splat(0.0); + ab0 = a; + ab1 = a; + ab2 = a; + ab3 = a; + } + FusedKerSpec::LoadTile(_cols, rows) => { + let rows = rows as *const v128; + ab0 = *rows; + ab1 = *rows.add(1); + ab2 = *rows.add(2); + ab3 = *rows.add(3); + } + FusedKerSpec::ScalarMin(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_min(a, ab0); + ab1 = f32x4_min(a, ab1); + ab2 = f32x4_min(a, ab2); + ab3 = f32x4_min(a, ab3); + } + FusedKerSpec::ScalarMax(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_max(a, ab0); + ab1 = f32x4_max(a, ab1); + ab2 = f32x4_max(a, ab2); + ab3 = f32x4_max(a, ab3); + } + FusedKerSpec::ScalarAdd(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_add(a, ab0); + ab1 = f32x4_add(a, ab1); + ab2 = f32x4_add(a, ab2); + ab3 = f32x4_add(a, ab3); + } + FusedKerSpec::ScalarMul(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_mul(a, ab0); + ab1 = f32x4_mul(a, ab1); + ab2 = f32x4_mul(a, ab2); + ab3 = f32x4_mul(a, ab3); + } + FusedKerSpec::ScalarSub(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_sub(a, ab0); + ab1 = f32x4_sub(a, ab1); + ab2 = f32x4_sub(a, ab2); + ab3 = f32x4_sub(a, ab3); + } + FusedKerSpec::ScalarSubF(a) => { + let a = f32x4_splat(a); + ab0 = f32x4_sub(ab0, a); + ab1 = f32x4_sub(ab1, a); + ab2 = f32x4_sub(ab2, a); + ab3 = f32x4_sub(ab3, a); + } + FusedKerSpec::LeakyRelu(a) => { + let a = f32x4_splat(a); + let zero = f32x4_splat(0.0); + + let mask0 = f32x4_gt(ab0, zero); + ab0 = v128_bitselect(ab0, f32x4_mul(a, ab0), mask0); + + let mask1 = f32x4_gt(ab1, zero); + ab1 = v128_bitselect(ab1, f32x4_mul(a, ab1), mask1); + + let mask2 = f32x4_gt(ab2, zero); + ab2 = v128_bitselect(ab2, f32x4_mul(a, ab2), mask2); + + let mask3 = f32x4_gt(ab3, zero); + ab3 = v128_bitselect(ab3, f32x4_mul(a, ab3), mask3); + } + 
FusedKerSpec::PerRowMin(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_min(f32x4_splat(row[0]), ab0); + ab1 = f32x4_min(f32x4_splat(row[1]), ab1); + ab2 = f32x4_min(f32x4_splat(row[2]), ab2); + ab3 = f32x4_min(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowMax(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_max(f32x4_splat(row[0]), ab0); + ab1 = f32x4_max(f32x4_splat(row[1]), ab1); + ab2 = f32x4_max(f32x4_splat(row[2]), ab2); + ab3 = f32x4_max(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowAdd(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_add(f32x4_splat(row[0]), ab0); + ab1 = f32x4_add(f32x4_splat(row[1]), ab1); + ab2 = f32x4_add(f32x4_splat(row[2]), ab2); + ab3 = f32x4_add(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowMul(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_mul(f32x4_splat(row[0]), ab0); + ab1 = f32x4_mul(f32x4_splat(row[1]), ab1); + ab2 = f32x4_mul(f32x4_splat(row[2]), ab2); + ab3 = f32x4_mul(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowSub(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_sub(f32x4_splat(row[0]), ab0); + ab1 = f32x4_sub(f32x4_splat(row[1]), ab1); + ab2 = f32x4_sub(f32x4_splat(row[2]), ab2); + ab3 = f32x4_sub(f32x4_splat(row[3]), ab3); + } + FusedKerSpec::PerRowSubF(row) => { + let row = std::slice::from_raw_parts(row, 4); + ab0 = f32x4_sub(ab0, f32x4_splat(row[0])); + ab1 = f32x4_sub(ab1, f32x4_splat(row[1])); + ab2 = f32x4_sub(ab2, f32x4_splat(row[2])); + ab3 = f32x4_sub(ab3, f32x4_splat(row[3])); + } + FusedKerSpec::PerColMin(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_min(cols, ab0); + ab1 = f32x4_min(cols, ab1); + ab2 = f32x4_min(cols, ab2); + ab3 = f32x4_min(cols, ab3); + } + FusedKerSpec::PerColMax(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_max(cols, ab0); + ab1 = f32x4_max(cols, ab1); + ab2 = f32x4_max(cols, ab2); + ab3 = 
f32x4_max(cols, ab3); + } + FusedKerSpec::PerColAdd(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_add(cols, ab0); + ab1 = f32x4_add(cols, ab1); + ab2 = f32x4_add(cols, ab2); + ab3 = f32x4_add(cols, ab3); + } + FusedKerSpec::PerColMul(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_mul(cols, ab0); + ab1 = f32x4_mul(cols, ab1); + ab2 = f32x4_mul(cols, ab2); + ab3 = f32x4_mul(cols, ab3); + } + FusedKerSpec::PerColSub(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_sub(cols, ab0); + ab1 = f32x4_sub(cols, ab1); + ab2 = f32x4_sub(cols, ab2); + ab3 = f32x4_sub(cols, ab3); + } + FusedKerSpec::PerColSubF(cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_sub(ab0, cols); + ab1 = f32x4_sub(ab1, cols); + ab2 = f32x4_sub(ab2, cols); + ab3 = f32x4_sub(ab3, cols); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let scale = f32x4_splat(scaler.scale); + ab0 = f32x4_mul(scale, ab0); + ab1 = f32x4_mul(scale, ab1); + ab2 = f32x4_mul(scale, ab2); + ab3 = f32x4_mul(scale, ab3); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let shift = f32x4_splat(2f32.powi(-(shift as i32))); + ab0 = f32x4_mul(shift, ab0); + ab1 = f32x4_mul(shift, ab1); + ab2 = f32x4_mul(shift, ab2); + ab3 = f32x4_mul(shift, ab3); + } + FusedKerSpec::ShiftLeft(shift) => { + let shift = f32x4_splat(2f32.powi(shift as i32)); + ab0 = f32x4_mul(shift, ab0); + ab1 = f32x4_mul(shift, ab1); + ab2 = f32x4_mul(shift, ab2); + ab3 = f32x4_mul(shift, ab3); + } + FusedKerSpec::AddUnicast(tile) => { + let mut ptr: *const u8 = tile.ptr; + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab0 = f32x4_add(ab0, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as 
*const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab1 = f32x4_add(ab1, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab2 = f32x4_add(ab2, f32x4(m0, m1, m2, m3)); + ptr = ptr.add(tile.row_byte_stride as usize); + + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + ab3 = f32x4_add(ab3, f32x4(m0, m1, m2, m3)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let cols = v128_load(cols as *const v128); + ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols)); + ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols)); + ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols)); + ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols)); + } + FusedKerSpec::Store(tile) => { + let mut ptr: *mut u8 = tile.ptr; + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab0); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab0); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab0); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab0); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab1); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab1); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab1); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = 
f32x4_extract_lane::<3>(ab1); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab2); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab2); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab2); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab2); + ptr = ptr.add(tile.row_byte_stride as usize); + + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab3); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab3); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab3); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab3); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + let a = pa as *const f32; + let b = pb as *const v128; + for i in 0..k { + let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4); + let b = v128_load(b.offset(i as isize)); + ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b)); + ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b)); + ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b)); + ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b)); + } + } + } + pnl = pnl.add(1); + } + 0 +} + +MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products +/// (single-column outputs, e.g., streaming-RNN inference where each frame's +/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec +/// match arms but collapses the column dimension from 4 to 1: a single +/// f32x4 accumulator holds 4 output rows × 1 output column packed as +/// [ab[0], ab[1], ab[2], ab[3]]. +/// +/// Selection: tract-core's einsum kernel_selection::strategize() prefers +/// kernels with nr() == 1 when op.n.is_one(), so this kernel is +/// automatically picked for N=1 cases once registered. 
+unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + // Single accumulator: 4 rows × 1 col, packed into one f32x4. + // lane[i] holds ab[i] = the output value for row i (col 0). + let mut ab = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + ab = f32x4_splat(0.0); + } + FusedKerSpec::LoadTile(_cols, rows) => { + // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128 + ab = v128_load(rows as *const v128); + } + FusedKerSpec::ScalarMin(a) => { + ab = f32x4_min(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarMax(a) => { + ab = f32x4_max(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarAdd(a) => { + ab = f32x4_add(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarMul(a) => { + ab = f32x4_mul(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarSub(a) => { + ab = f32x4_sub(f32x4_splat(a), ab); + } + FusedKerSpec::ScalarSubF(a) => { + ab = f32x4_sub(ab, f32x4_splat(a)); + } + FusedKerSpec::LeakyRelu(a) => { + let zero = f32x4_splat(0.0); + let mask = f32x4_gt(ab, zero); + ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask); + } + FusedKerSpec::PerRowMin(row) => { + // 4 row values, applied to ab's 4 lanes in order + let r = v128_load(row as *const v128); + ab = f32x4_min(r, ab); + } + FusedKerSpec::PerRowMax(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_max(r, ab); + } + FusedKerSpec::PerRowAdd(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_add(r, ab); + } + FusedKerSpec::PerRowMul(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_mul(r, ab); + } + FusedKerSpec::PerRowSub(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_sub(r, ab); + } + FusedKerSpec::PerRowSubF(row) => { + let r = v128_load(row as *const v128); + ab = f32x4_sub(ab, r); + } + FusedKerSpec::PerColMin(cols) => { + // Single col value broadcast to all 4 rows + ab = f32x4_min(f32x4_splat(*cols), ab); + } + 
FusedKerSpec::PerColMax(cols) => { + ab = f32x4_max(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColAdd(cols) => { + ab = f32x4_add(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColMul(cols) => { + ab = f32x4_mul(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColSub(cols) => { + ab = f32x4_sub(f32x4_splat(*cols), ab); + } + FusedKerSpec::PerColSubF(cols) => { + ab = f32x4_sub(ab, f32x4_splat(*cols)); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + ab = f32x4_mul(f32x4_splat(scaler.scale), ab); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + ab = f32x4_mul(s, ab); + } + FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + ab = f32x4_mul(s, ab); + } + FusedKerSpec::AddUnicast(tile) => { + // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1) + let mut ptr: *const u8 = tile.ptr; + let m0 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m1 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m2 = *(ptr as *const f32); + ptr = ptr.add(tile.row_byte_stride as usize); + let m3 = *(ptr as *const f32); + ab = f32x4_add(ab, f32x4(m0, m1, m2, m3)); + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + // ab[i] += rows[i] * cols[0] (cols[0] is the single col) + let r = v128_load(rows as *const v128); + let c = f32x4_splat(*cols); + ab = f32x4_add(ab, f32x4_mul(r, c)); + } + FusedKerSpec::Store(tile) => { + // 4 rows × 1 col, write each lane to a separate row + let mut ptr: *mut u8 = tile.ptr; + *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab); + ptr = ptr.add(tile.row_byte_stride as usize); + *(ptr as *mut f32) = 
f32x4_extract_lane::<3>(ab); + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128. + // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast. + // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat + let a = pa as *const v128; + let b = pb as *const f32; + for i in 0..k { + let a_vec = v128_load(a.offset(i as isize)); + let b_splat = f32x4_splat(*b.offset(i as isize)); + ab = f32x4_add(ab, f32x4_mul(a_vec, b_splat)); + } + } + } + pnl = pnl.add(1); + } + 0 +} + +MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1(4,1)@(4,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products +/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top, +/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration: +/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per +/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the +/// hardware pipeline level. +/// +/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is +/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For +/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield +/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput +/// exceeds 1 op/cycle. +/// +/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1 +/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1 +/// ops once registered (including small-M cases where it slightly wastes +/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those +/// ops are <1% of frame so the regression is noise). 
+unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec) -> isize {
+    // Runs the fused-op program starting at `pnl` on one 8-row × 1-col f32 tile.
+    //
+    // Caller contract: `pnl` is null or points to consecutive `FusedKerSpec`
+    // entries terminated by `Done`; every pointer carried by an entry must be
+    // valid for the 8 rows read (or written, for `Store`) below.
+    // Always returns 0.
+    use std::arch::wasm32::*;
+
+    // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot]
+    // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4
+    let mut ab_top = f32x4_splat(0.0);
+    let mut ab_bot = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                ab_top = f32x4_splat(0.0);
+                ab_bot = f32x4_splat(0.0);
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                // 8 rows × 1 col = 8 contiguous f32 = 2 v128.
+                // `v128_load` is used instead of `*p` so the tile buffer does not
+                // have to be 16-byte aligned (same as every other load here).
+                let p = rows as *const v128;
+                ab_top = v128_load(p);
+                ab_bot = v128_load(p.add(1));
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_min(s, ab_top);
+                ab_bot = f32x4_min(s, ab_bot);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_max(s, ab_top);
+                ab_bot = f32x4_max(s, ab_bot);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_add(s, ab_top);
+                ab_bot = f32x4_add(s, ab_bot);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                // Computes a - acc.
+                let s = f32x4_splat(a);
+                ab_top = f32x4_sub(s, ab_top);
+                ab_bot = f32x4_sub(s, ab_bot);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                // "Flipped" subtraction: acc - a.
+                let s = f32x4_splat(a);
+                ab_top = f32x4_sub(ab_top, s);
+                ab_bot = f32x4_sub(ab_bot, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                // Keep lanes that are > 0, replace the others by a * lane.
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let mask_t = f32x4_gt(ab_top, zero);
+                let mask_b = f32x4_gt(ab_bot, zero);
+                ab_top = v128_bitselect(ab_top, f32x4_mul(s, ab_top), mask_t);
+                ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                // `row` supplies one value per tile row: 8 f32 = 2 v128.
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_min(r_t, ab_top);
+                ab_bot = f32x4_min(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_max(r_t, ab_top);
+                ab_bot = f32x4_max(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_add(r_t, ab_top);
+                ab_bot = f32x4_add(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_mul(r_t, ab_top);
+                ab_bot = f32x4_mul(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_sub(r_t, ab_top);
+                ab_bot = f32x4_sub(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_sub(ab_top, r_t);
+                ab_bot = f32x4_sub(ab_bot, r_b);
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                // Single column: broadcast its one value to every row lane.
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_min(c, ab_top);
+                ab_bot = f32x4_min(c, ab_bot);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_max(c, ab_top);
+                ab_bot = f32x4_max(c, ab_bot);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_add(c, ab_top);
+                ab_bot = f32x4_add(c, ab_bot);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_mul(c, ab_top);
+                ab_bot = f32x4_mul(c, ab_bot);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_sub(c, ab_top);
+                ab_bot = f32x4_sub(c, ab_bot);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_sub(ab_top, c);
+                ab_bot = f32x4_sub(ab_bot, c);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                // On f32 a right shift is a multiplication by 2^-shift.
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 8 rows × 1 col, stride is row_byte_stride between rows.
+                // The stride is signed: step with `wrapping_offset` rather than
+                // `add(stride as usize)`, which overflows for negative strides.
+                let stride = tile.row_byte_stride;
+                let mut ptr: *const u8 = tile.ptr;
+                let m0 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m1 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m2 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m3 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m4 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m5 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m6 = *(ptr as *const f32);
+                ptr = ptr.wrapping_offset(stride);
+                let m7 = *(ptr as *const f32);
+                ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3));
+                ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7));
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                // acc[r] += rows[r] * cols[0]
+                let p = rows as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_add(ab_top, f32x4_mul(r_t, c));
+                ab_bot = f32x4_add(ab_bot, f32x4_mul(r_b, c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 8 rows × 1 col, write each lane to a separate row
+                // (signed stride, see AddUnicast).
+                let stride = tile.row_byte_stride;
+                let mut ptr: *mut u8 = tile.ptr;
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_bot);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot);
+                ptr = ptr.wrapping_offset(stride);
+                *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot);
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128
+                // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                // The two mul+add updates on (ab_top, ab_bot) are independent —
+                // 2-way ILP per iter.
+                let a = pa as *const v128;
+                let b = pb as *const f32;
+                for i in 0..k {
+                    let a_t = v128_load(a.offset((2 * i) as isize));
+                    let a_b = v128_load(a.offset((2 * i + 1) as isize));
+                    let b_splat = f32x4_splat(*b.offset(i as isize));
+                    ab_top = f32x4_add(ab_top, f32x4_mul(a_t, b_splat));
+                    ab_bot = f32x4_add(ab_bot, f32x4_mul(a_b, b_splat));
+                }
+            }
+        }
+        // Advance to the next fused operation.
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1(8,1)@(8,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x4 kernel — wider MM tile for matrix-matrix products
+/// where N>1 (i.e., N>=4 effectively, since the strategizer picks this for
+/// any non-N=1 case). Processes 8 rows × 4 cols per tile using 8 f32x4
+/// accumulators (one per row, holding 4 cols each).
+///
+/// Compared to the existing wasm_f32_4x4 (4 accums, mr*nr=16), this has
+/// 2× the rows per tile (mr*nr=32) so:
+/// - Half as many tile iterations needed to cover the M dimension
+/// - 8 independent fmadds per k-iter (vs 4) — better SIMD pipeline saturation
+/// - Same column-tile width (4), no waste vs 4x4 for N>1 cases
+///
+/// Selection: kernel_selection::strategize() prefers max(nr*mr) for N>1,
+/// so this wins over 4x4 (32 > 16) for any non-GEMV op.
+unsafe fn kernel_f32_8x4(mut pnl: *const FusedKerSpec) -> isize {
+    // Runs the fused-op program starting at `pnl` on one 8-row × 4-col f32 tile.
+    //
+    // Caller contract: `pnl` is null or points to consecutive `FusedKerSpec`
+    // entries terminated by `Done`; every pointer carried by an entry must be
+    // valid for the 8×4 elements read (or written, for `Store`) below.
+    // Always returns 0.
+    use std::arch::wasm32::*;
+
+    // 8 accumulators: 8 rows × 4 cols, packed as 8 f32x4
+    // (abN holds row N, lane j holds column j)
+    let mut ab0 = f32x4_splat(0.0);
+    let mut ab1 = f32x4_splat(0.0);
+    let mut ab2 = f32x4_splat(0.0);
+    let mut ab3 = f32x4_splat(0.0);
+    let mut ab4 = f32x4_splat(0.0);
+    let mut ab5 = f32x4_splat(0.0);
+    let mut ab6 = f32x4_splat(0.0);
+    let mut ab7 = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                let z = f32x4_splat(0.0);
+                ab0 = z; ab1 = z; ab2 = z; ab3 = z;
+                ab4 = z; ab5 = z; ab6 = z; ab7 = z;
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                // Reads 8 consecutive v128, one per accumulator row.
+                // `v128_load` is used instead of `*p` so the tile buffer does not
+                // have to be 16-byte aligned (same as every other load here).
+                let p = rows as *const v128;
+                ab0 = v128_load(p);
+                ab1 = v128_load(p.add(1));
+                ab2 = v128_load(p.add(2));
+                ab3 = v128_load(p.add(3));
+                ab4 = v128_load(p.add(4));
+                ab5 = v128_load(p.add(5));
+                ab6 = v128_load(p.add(6));
+                ab7 = v128_load(p.add(7));
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_min(s, ab0); ab1 = f32x4_min(s, ab1);
+                ab2 = f32x4_min(s, ab2); ab3 = f32x4_min(s, ab3);
+                ab4 = f32x4_min(s, ab4); ab5 = f32x4_min(s, ab5);
+                ab6 = f32x4_min(s, ab6); ab7 = f32x4_min(s, ab7);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_max(s, ab0); ab1 = f32x4_max(s, ab1);
+                ab2 = f32x4_max(s, ab2); ab3 = f32x4_max(s, ab3);
+                ab4 = f32x4_max(s, ab4); ab5 = f32x4_max(s, ab5);
+                ab6 = f32x4_max(s, ab6); ab7 = f32x4_max(s, ab7);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_add(s, ab0); ab1 = f32x4_add(s, ab1);
+                ab2 = f32x4_add(s, ab2); ab3 = f32x4_add(s, ab3);
+                ab4 = f32x4_add(s, ab4); ab5 = f32x4_add(s, ab5);
+                ab6 = f32x4_add(s, ab6); ab7 = f32x4_add(s, ab7);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                // Computes a - acc.
+                let s = f32x4_splat(a);
+                ab0 = f32x4_sub(s, ab0); ab1 = f32x4_sub(s, ab1);
+                ab2 = f32x4_sub(s, ab2); ab3 = f32x4_sub(s, ab3);
+                ab4 = f32x4_sub(s, ab4); ab5 = f32x4_sub(s, ab5);
+                ab6 = f32x4_sub(s, ab6); ab7 = f32x4_sub(s, ab7);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                // "Flipped" subtraction: acc - a.
+                let s = f32x4_splat(a);
+                ab0 = f32x4_sub(ab0, s); ab1 = f32x4_sub(ab1, s);
+                ab2 = f32x4_sub(ab2, s); ab3 = f32x4_sub(ab3, s);
+                ab4 = f32x4_sub(ab4, s); ab5 = f32x4_sub(ab5, s);
+                ab6 = f32x4_sub(ab6, s); ab7 = f32x4_sub(ab7, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                // Keep lanes that are > 0, replace the others by a * lane.
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let m0 = f32x4_gt(ab0, zero); ab0 = v128_bitselect(ab0, f32x4_mul(s, ab0), m0);
+                let m1 = f32x4_gt(ab1, zero); ab1 = v128_bitselect(ab1, f32x4_mul(s, ab1), m1);
+                let m2 = f32x4_gt(ab2, zero); ab2 = v128_bitselect(ab2, f32x4_mul(s, ab2), m2);
+                let m3 = f32x4_gt(ab3, zero); ab3 = v128_bitselect(ab3, f32x4_mul(s, ab3), m3);
+                let m4 = f32x4_gt(ab4, zero); ab4 = v128_bitselect(ab4, f32x4_mul(s, ab4), m4);
+                let m5 = f32x4_gt(ab5, zero); ab5 = v128_bitselect(ab5, f32x4_mul(s, ab5), m5);
+                let m6 = f32x4_gt(ab6, zero); ab6 = v128_bitselect(ab6, f32x4_mul(s, ab6), m6);
+                let m7 = f32x4_gt(ab7, zero); ab7 = v128_bitselect(ab7, f32x4_mul(s, ab7), m7);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                // `row` supplies one value per tile row; each is broadcast
+                // across the 4 columns of its accumulator.
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_min(f32x4_splat(r[0]), ab0); ab1 = f32x4_min(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_min(f32x4_splat(r[2]), ab2); ab3 = f32x4_min(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_min(f32x4_splat(r[4]), ab4); ab5 = f32x4_min(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_min(f32x4_splat(r[6]), ab6); ab7 = f32x4_min(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_max(f32x4_splat(r[0]), ab0); ab1 = f32x4_max(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_max(f32x4_splat(r[2]), ab2); ab3 = f32x4_max(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_max(f32x4_splat(r[4]), ab4); ab5 = f32x4_max(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_max(f32x4_splat(r[6]), ab6); ab7 = f32x4_max(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_add(f32x4_splat(r[0]), ab0); ab1 = f32x4_add(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_add(f32x4_splat(r[2]), ab2); ab3 = f32x4_add(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_add(f32x4_splat(r[4]), ab4); ab5 = f32x4_add(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_add(f32x4_splat(r[6]), ab6); ab7 = f32x4_add(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_mul(f32x4_splat(r[0]), ab0); ab1 = f32x4_mul(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_mul(f32x4_splat(r[2]), ab2); ab3 = f32x4_mul(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_mul(f32x4_splat(r[4]), ab4); ab5 = f32x4_mul(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_mul(f32x4_splat(r[6]), ab6); ab7 = f32x4_mul(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_sub(f32x4_splat(r[0]), ab0); ab1 = f32x4_sub(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_sub(f32x4_splat(r[2]), ab2); ab3 = f32x4_sub(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_sub(f32x4_splat(r[4]), ab4); ab5 = f32x4_sub(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_sub(f32x4_splat(r[6]), ab6); ab7 = f32x4_sub(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_sub(ab0, f32x4_splat(r[0])); ab1 = f32x4_sub(ab1, f32x4_splat(r[1]));
+                ab2 = f32x4_sub(ab2, f32x4_splat(r[2])); ab3 = f32x4_sub(ab3, f32x4_splat(r[3]));
+                ab4 = f32x4_sub(ab4, f32x4_splat(r[4])); ab5 = f32x4_sub(ab5, f32x4_splat(r[5]));
+                ab6 = f32x4_sub(ab6, f32x4_splat(r[6])); ab7 = f32x4_sub(ab7, f32x4_splat(r[7]));
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                // `cols` supplies one value per tile column: 4 f32 = 1 v128,
+                // applied identically to every row.
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_min(c, ab0); ab1 = f32x4_min(c, ab1);
+                ab2 = f32x4_min(c, ab2); ab3 = f32x4_min(c, ab3);
+                ab4 = f32x4_min(c, ab4); ab5 = f32x4_min(c, ab5);
+                ab6 = f32x4_min(c, ab6); ab7 = f32x4_min(c, ab7);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_max(c, ab0); ab1 = f32x4_max(c, ab1);
+                ab2 = f32x4_max(c, ab2); ab3 = f32x4_max(c, ab3);
+                ab4 = f32x4_max(c, ab4); ab5 = f32x4_max(c, ab5);
+                ab6 = f32x4_max(c, ab6); ab7 = f32x4_max(c, ab7);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_add(c, ab0); ab1 = f32x4_add(c, ab1);
+                ab2 = f32x4_add(c, ab2); ab3 = f32x4_add(c, ab3);
+                ab4 = f32x4_add(c, ab4); ab5 = f32x4_add(c, ab5);
+                ab6 = f32x4_add(c, ab6); ab7 = f32x4_add(c, ab7);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_mul(c, ab0); ab1 = f32x4_mul(c, ab1);
+                ab2 = f32x4_mul(c, ab2); ab3 = f32x4_mul(c, ab3);
+                ab4 = f32x4_mul(c, ab4); ab5 = f32x4_mul(c, ab5);
+                ab6 = f32x4_mul(c, ab6); ab7 = f32x4_mul(c, ab7);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_sub(c, ab0); ab1 = f32x4_sub(c, ab1);
+                ab2 = f32x4_sub(c, ab2); ab3 = f32x4_sub(c, ab3);
+                ab4 = f32x4_sub(c, ab4); ab5 = f32x4_sub(c, ab5);
+                ab6 = f32x4_sub(c, ab6); ab7 = f32x4_sub(c, ab7);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_sub(ab0, c); ab1 = f32x4_sub(ab1, c);
+                ab2 = f32x4_sub(ab2, c); ab3 = f32x4_sub(ab3, c);
+                ab4 = f32x4_sub(ab4, c); ab5 = f32x4_sub(ab5, c);
+                ab6 = f32x4_sub(ab6, c); ab7 = f32x4_sub(ab7, c);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                // On f32 a right shift is a multiplication by 2^-shift.
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 8 rows × 4 cols, with col_byte_stride and row_byte_stride.
+                // Rows are stepped with `wrapping_offset`: the stride is signed
+                // (so `add(stride as usize)` would overflow when it is negative)
+                // and the step after the last row may leave the buffer.
+                let mut ptr: *const u8 = tile.ptr;
+                for ab_ref in [&mut ab0, &mut ab1, &mut ab2, &mut ab3,
+                               &mut ab4, &mut ab5, &mut ab6, &mut ab7].iter_mut() {
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    **ab_ref = f32x4_add(**ab_ref, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                }
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                // acc[r][c] += rows[r] * cols[c]
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), c));
+                ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), c));
+                ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), c));
+                ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), c));
+                ab4 = f32x4_add(ab4, f32x4_mul(f32x4_splat(*rows.add(4)), c));
+                ab5 = f32x4_add(ab5, f32x4_mul(f32x4_splat(*rows.add(5)), c));
+                ab6 = f32x4_add(ab6, f32x4_mul(f32x4_splat(*rows.add(6)), c));
+                ab7 = f32x4_add(ab7, f32x4_mul(f32x4_splat(*rows.add(7)), c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 8 rows × 4 cols stores (signed row stride, see AddUnicast)
+                let mut ptr: *mut u8 = tile.ptr;
+                for ab in [ab0, ab1, ab2, ab3, ab4, ab5, ab6, ab7].iter() {
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                }
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=8] = each k iter loads 8 row values
+                // B: packed [k][NR=4] = each k iter loads 4 col values as 1 v128
+                let a = pa as *const f32;
+                let b = pb as *const v128;
+                for i in 0..k {
+                    let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8);
+                    let bvec = v128_load(b.offset(i as isize));
+                    ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(arow[0]), bvec));
+                    ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(arow[1]), bvec));
+                    ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(arow[2]), bvec));
+                    ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(arow[3]), bvec));
+                    ab4 = f32x4_add(ab4, f32x4_mul(f32x4_splat(arow[4]), bvec));
+                    ab5 = f32x4_add(ab5, f32x4_mul(f32x4_splat(arow[5]), bvec));
+                    ab6 = f32x4_add(ab6, f32x4_mul(f32x4_splat(arow[6]), bvec));
+                    ab7 = f32x4_add(ab7, f32x4_mul(f32x4_splat(arow[7]), bvec));
+                }
+            }
+        }
+        // Advance to the next fused operation.
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_8x4 => wasm_f32_8x4(8,4)@(8,4) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products
+/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3,
+/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration.
+///
+/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more
+/// parallel work to the SIMD pipelines, beneficial on hardware with 3+
+/// SIMD execution units (most modern ARM and x86).
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec) -> isize {
+    // Runs the fused-op program starting at `pnl` on one 16-row × 1-col f32 tile.
+    //
+    // Caller contract: `pnl` is null or points to consecutive `FusedKerSpec`
+    // entries terminated by `Done`; every pointer carried by an entry must be
+    // valid for the 16 rows read (or written, for `Store`) below.
+    // Always returns 0.
+    use std::arch::wasm32::*;
+
+    // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3]
+    // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15
+    let mut ab_q0 = f32x4_splat(0.0);
+    let mut ab_q1 = f32x4_splat(0.0);
+    let mut ab_q2 = f32x4_splat(0.0);
+    let mut ab_q3 = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                let z = f32x4_splat(0.0);
+                ab_q0 = z; ab_q1 = z; ab_q2 = z; ab_q3 = z;
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                // 16 rows × 1 col = 16 contiguous f32 = 4 v128.
+                // `v128_load` is used instead of `*p` so the tile buffer does not
+                // have to be 16-byte aligned (same as every other load here).
+                let p = rows as *const v128;
+                ab_q0 = v128_load(p);
+                ab_q1 = v128_load(p.add(1));
+                ab_q2 = v128_load(p.add(2));
+                ab_q3 = v128_load(p.add(3));
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_min(s, ab_q0); ab_q1 = f32x4_min(s, ab_q1);
+                ab_q2 = f32x4_min(s, ab_q2); ab_q3 = f32x4_min(s, ab_q3);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_max(s, ab_q0); ab_q1 = f32x4_max(s, ab_q1);
+                ab_q2 = f32x4_max(s, ab_q2); ab_q3 = f32x4_max(s, ab_q3);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_add(s, ab_q0); ab_q1 = f32x4_add(s, ab_q1);
+                ab_q2 = f32x4_add(s, ab_q2); ab_q3 = f32x4_add(s, ab_q3);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                // Computes a - acc.
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_sub(s, ab_q0); ab_q1 = f32x4_sub(s, ab_q1);
+                ab_q2 = f32x4_sub(s, ab_q2); ab_q3 = f32x4_sub(s, ab_q3);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                // "Flipped" subtraction: acc - a.
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_sub(ab_q0, s); ab_q1 = f32x4_sub(ab_q1, s);
+                ab_q2 = f32x4_sub(ab_q2, s); ab_q3 = f32x4_sub(ab_q3, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                // Keep lanes that are > 0, replace the others by a * lane.
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let m0 = f32x4_gt(ab_q0, zero); ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0);
+                let m1 = f32x4_gt(ab_q1, zero); ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1);
+                let m2 = f32x4_gt(ab_q2, zero); ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2);
+                let m3 = f32x4_gt(ab_q3, zero); ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                // `row` supplies one value per tile row: 16 f32 = 4 v128.
+                let p = row as *const v128;
+                ab_q0 = f32x4_min(v128_load(p), ab_q0);
+                ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_max(v128_load(p), ab_q0);
+                ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_add(v128_load(p), ab_q0);
+                ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_mul(v128_load(p), ab_q0);
+                ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_sub(v128_load(p), ab_q0);
+                ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_sub(ab_q0, v128_load(p));
+                ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1)));
+                ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2)));
+                ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3)));
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                // Single column: broadcast its one value to every row lane.
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_min(c, ab_q0); ab_q1 = f32x4_min(c, ab_q1);
+                ab_q2 = f32x4_min(c, ab_q2); ab_q3 = f32x4_min(c, ab_q3);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_max(c, ab_q0); ab_q1 = f32x4_max(c, ab_q1);
+                ab_q2 = f32x4_max(c, ab_q2); ab_q3 = f32x4_max(c, ab_q3);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_add(c, ab_q0); ab_q1 = f32x4_add(c, ab_q1);
+                ab_q2 = f32x4_add(c, ab_q2); ab_q3 = f32x4_add(c, ab_q3);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_mul(c, ab_q0); ab_q1 = f32x4_mul(c, ab_q1);
+                ab_q2 = f32x4_mul(c, ab_q2); ab_q3 = f32x4_mul(c, ab_q3);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_sub(c, ab_q0); ab_q1 = f32x4_sub(c, ab_q1);
+                ab_q2 = f32x4_sub(c, ab_q2); ab_q3 = f32x4_sub(c, ab_q3);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_sub(ab_q0, c); ab_q1 = f32x4_sub(ab_q1, c);
+                ab_q2 = f32x4_sub(ab_q2, c); ab_q3 = f32x4_sub(ab_q3, c);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                // On f32 a right shift is a multiplication by 2^-shift.
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 16 rows × 1 col, with row_byte_stride between rows.
+                // Rows are stepped with `wrapping_offset`: the stride is signed
+                // (so `add(stride as usize)` would overflow when it is negative)
+                // and the step after the last row may leave the buffer.
+                let mut ptr: *const u8 = tile.ptr;
+                let mut ms = [0f32; 16];
+                for m in ms.iter_mut() {
+                    *m = *(ptr as *const f32);
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                }
+                ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3]));
+                ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7]));
+                ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11]));
+                ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15]));
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                // acc[r] += rows[r] * cols[0]
+                let p = rows as *const v128;
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_add(ab_q0, f32x4_mul(v128_load(p), c));
+                ab_q1 = f32x4_add(ab_q1, f32x4_mul(v128_load(p.add(1)), c));
+                ab_q2 = f32x4_add(ab_q2, f32x4_mul(v128_load(p.add(2)), c));
+                ab_q3 = f32x4_add(ab_q3, f32x4_mul(v128_load(p.add(3)), c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 16 rows × 1 col, write each lane to a separate row
+                // (signed row stride, see AddUnicast).
+                let mut ptr: *mut u8 = tile.ptr;
+                for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() {
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                    ptr = ptr.wrapping_offset(tile.row_byte_stride);
+                }
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128
+                // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                // 4 INDEPENDENT mul+add updates per k-iter — 4-way ILP
+                let a = pa as *const v128;
+                let b = pb as *const f32;
+                for i in 0..k {
+                    let a0 = v128_load(a.offset((4 * i) as isize));
+                    let a1 = v128_load(a.offset((4 * i + 1) as isize));
+                    let a2 = v128_load(a.offset((4 * i + 2) as isize));
+                    let a3 = v128_load(a.offset((4 * i + 3) as isize));
+                    let bs = f32x4_splat(*b.offset(i as isize));
+                    ab_q0 = f32x4_add(ab_q0, f32x4_mul(a0, bs));
+                    ab_q1 = f32x4_add(ab_q1, f32x4_mul(a1, bs));
+                    ab_q2 = f32x4_add(ab_q2, f32x4_mul(a2, bs));
+                    ab_q3 = f32x4_add(ab_q3, f32x4_mul(a3, bs));
+                }
+            }
+        }
+        // Advance to the next fused operation.
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+
+MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1(16,1)@(16,1) quality(ImplementationQuality::TargetOptimized)); + +/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators). +/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. 16 accumulators +/// is at the limit of WASM's 16 logical SIMD register slots; this tests the +/// register-pressure boundary. For DFN3 ops, all M and N are multiples of 8, +/// so 8x8 fits cleanly with no padding waste. +unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec) -> isize { + use std::arch::wasm32::*; + + // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi) + let mut a0lo = f32x4_splat(0.0); let mut a0hi = f32x4_splat(0.0); + let mut a1lo = f32x4_splat(0.0); let mut a1hi = f32x4_splat(0.0); + let mut a2lo = f32x4_splat(0.0); let mut a2hi = f32x4_splat(0.0); + let mut a3lo = f32x4_splat(0.0); let mut a3hi = f32x4_splat(0.0); + let mut a4lo = f32x4_splat(0.0); let mut a4hi = f32x4_splat(0.0); + let mut a5lo = f32x4_splat(0.0); let mut a5hi = f32x4_splat(0.0); + let mut a6lo = f32x4_splat(0.0); let mut a6hi = f32x4_splat(0.0); + let mut a7lo = f32x4_splat(0.0); let mut a7hi = f32x4_splat(0.0); + + while !pnl.is_null() { + match *pnl { + FusedKerSpec::Done => break, + FusedKerSpec::Clear => { + let z = f32x4_splat(0.0); + a0lo = z; a0hi = z; a1lo = z; a1hi = z; + a2lo = z; a2hi = z; a3lo = z; a3hi = z; + a4lo = z; a4hi = z; a5lo = z; a5hi = z; + a6lo = z; a6hi = z; a7lo = z; a7hi = z; + } + FusedKerSpec::LoadTile(_cols, rows) => { + // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi) + let p = rows as *const v128; + a0lo = *p.add(0); a0hi = *p.add(1); + a1lo = *p.add(2); a1hi = *p.add(3); + a2lo = *p.add(4); a2hi = *p.add(5); + a3lo = *p.add(6); a3hi = *p.add(7); + a4lo = *p.add(8); a4hi = *p.add(9); + a5lo = *p.add(10); a5hi = *p.add(11); + a6lo = *p.add(12); a6hi = *p.add(13); + a7lo = *p.add(14); a7hi = *p.add(15); + } + FusedKerSpec::ScalarMin(a) => { + let s = 
f32x4_splat(a); + a0lo = f32x4_min(s, a0lo); a0hi = f32x4_min(s, a0hi); + a1lo = f32x4_min(s, a1lo); a1hi = f32x4_min(s, a1hi); + a2lo = f32x4_min(s, a2lo); a2hi = f32x4_min(s, a2hi); + a3lo = f32x4_min(s, a3lo); a3hi = f32x4_min(s, a3hi); + a4lo = f32x4_min(s, a4lo); a4hi = f32x4_min(s, a4hi); + a5lo = f32x4_min(s, a5lo); a5hi = f32x4_min(s, a5hi); + a6lo = f32x4_min(s, a6lo); a6hi = f32x4_min(s, a6hi); + a7lo = f32x4_min(s, a7lo); a7hi = f32x4_min(s, a7hi); + } + FusedKerSpec::ScalarMax(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_max(s, a0lo); a0hi = f32x4_max(s, a0hi); + a1lo = f32x4_max(s, a1lo); a1hi = f32x4_max(s, a1hi); + a2lo = f32x4_max(s, a2lo); a2hi = f32x4_max(s, a2hi); + a3lo = f32x4_max(s, a3lo); a3hi = f32x4_max(s, a3hi); + a4lo = f32x4_max(s, a4lo); a4hi = f32x4_max(s, a4hi); + a5lo = f32x4_max(s, a5lo); a5hi = f32x4_max(s, a5hi); + a6lo = f32x4_max(s, a6lo); a6hi = f32x4_max(s, a6hi); + a7lo = f32x4_max(s, a7lo); a7hi = f32x4_max(s, a7hi); + } + FusedKerSpec::ScalarAdd(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_add(s, a0lo); a0hi = f32x4_add(s, a0hi); + a1lo = f32x4_add(s, a1lo); a1hi = f32x4_add(s, a1hi); + a2lo = f32x4_add(s, a2lo); a2hi = f32x4_add(s, a2hi); + a3lo = f32x4_add(s, a3lo); a3hi = f32x4_add(s, a3hi); + a4lo = f32x4_add(s, a4lo); a4hi = f32x4_add(s, a4hi); + a5lo = f32x4_add(s, a5lo); a5hi = f32x4_add(s, a5hi); + a6lo = f32x4_add(s, a6lo); a6hi = f32x4_add(s, a6hi); + a7lo = f32x4_add(s, a7lo); a7hi = f32x4_add(s, a7hi); + } + FusedKerSpec::ScalarMul(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi); 
+ } + FusedKerSpec::ScalarSub(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_sub(s, a0lo); a0hi = f32x4_sub(s, a0hi); + a1lo = f32x4_sub(s, a1lo); a1hi = f32x4_sub(s, a1hi); + a2lo = f32x4_sub(s, a2lo); a2hi = f32x4_sub(s, a2hi); + a3lo = f32x4_sub(s, a3lo); a3hi = f32x4_sub(s, a3hi); + a4lo = f32x4_sub(s, a4lo); a4hi = f32x4_sub(s, a4hi); + a5lo = f32x4_sub(s, a5lo); a5hi = f32x4_sub(s, a5hi); + a6lo = f32x4_sub(s, a6lo); a6hi = f32x4_sub(s, a6hi); + a7lo = f32x4_sub(s, a7lo); a7hi = f32x4_sub(s, a7hi); + } + FusedKerSpec::ScalarSubF(a) => { + let s = f32x4_splat(a); + a0lo = f32x4_sub(a0lo, s); a0hi = f32x4_sub(a0hi, s); + a1lo = f32x4_sub(a1lo, s); a1hi = f32x4_sub(a1hi, s); + a2lo = f32x4_sub(a2lo, s); a2hi = f32x4_sub(a2hi, s); + a3lo = f32x4_sub(a3lo, s); a3hi = f32x4_sub(a3hi, s); + a4lo = f32x4_sub(a4lo, s); a4hi = f32x4_sub(a4hi, s); + a5lo = f32x4_sub(a5lo, s); a5hi = f32x4_sub(a5hi, s); + a6lo = f32x4_sub(a6lo, s); a6hi = f32x4_sub(a6hi, s); + a7lo = f32x4_sub(a7lo, s); a7hi = f32x4_sub(a7hi, s); + } + FusedKerSpec::LeakyRelu(a) => { + let s = f32x4_splat(a); + let zero = f32x4_splat(0.0); + let m0a = f32x4_gt(a0lo, zero); a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a); + let m0b = f32x4_gt(a0hi, zero); a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b); + let m1a = f32x4_gt(a1lo, zero); a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a); + let m1b = f32x4_gt(a1hi, zero); a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b); + let m2a = f32x4_gt(a2lo, zero); a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a); + let m2b = f32x4_gt(a2hi, zero); a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b); + let m3a = f32x4_gt(a3lo, zero); a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a); + let m3b = f32x4_gt(a3hi, zero); a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b); + let m4a = f32x4_gt(a4lo, zero); a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a); + let m4b = f32x4_gt(a4hi, zero); a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b); 
+ let m5a = f32x4_gt(a5lo, zero); a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a); + let m5b = f32x4_gt(a5hi, zero); a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b); + let m6a = f32x4_gt(a6lo, zero); a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a); + let m6b = f32x4_gt(a6hi, zero); a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b); + let m7a = f32x4_gt(a7lo, zero); a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a); + let m7b = f32x4_gt(a7hi, zero); a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b); + } + FusedKerSpec::PerRowMin(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); a0lo = f32x4_min(r0, a0lo); a0hi = f32x4_min(r0, a0hi); + let r1 = f32x4_splat(r[1]); a1lo = f32x4_min(r1, a1lo); a1hi = f32x4_min(r1, a1hi); + let r2 = f32x4_splat(r[2]); a2lo = f32x4_min(r2, a2lo); a2hi = f32x4_min(r2, a2hi); + let r3 = f32x4_splat(r[3]); a3lo = f32x4_min(r3, a3lo); a3hi = f32x4_min(r3, a3hi); + let r4 = f32x4_splat(r[4]); a4lo = f32x4_min(r4, a4lo); a4hi = f32x4_min(r4, a4hi); + let r5 = f32x4_splat(r[5]); a5lo = f32x4_min(r5, a5lo); a5hi = f32x4_min(r5, a5hi); + let r6 = f32x4_splat(r[6]); a6lo = f32x4_min(r6, a6lo); a6hi = f32x4_min(r6, a6hi); + let r7 = f32x4_splat(r[7]); a7lo = f32x4_min(r7, a7lo); a7hi = f32x4_min(r7, a7hi); + } + FusedKerSpec::PerRowMax(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); a0lo = f32x4_max(r0, a0lo); a0hi = f32x4_max(r0, a0hi); + let r1 = f32x4_splat(r[1]); a1lo = f32x4_max(r1, a1lo); a1hi = f32x4_max(r1, a1hi); + let r2 = f32x4_splat(r[2]); a2lo = f32x4_max(r2, a2lo); a2hi = f32x4_max(r2, a2hi); + let r3 = f32x4_splat(r[3]); a3lo = f32x4_max(r3, a3lo); a3hi = f32x4_max(r3, a3hi); + let r4 = f32x4_splat(r[4]); a4lo = f32x4_max(r4, a4lo); a4hi = f32x4_max(r4, a4hi); + let r5 = f32x4_splat(r[5]); a5lo = f32x4_max(r5, a5lo); a5hi = f32x4_max(r5, a5hi); + let r6 = f32x4_splat(r[6]); a6lo = f32x4_max(r6, a6lo); a6hi = f32x4_max(r6, a6hi); + let r7 
= f32x4_splat(r[7]); a7lo = f32x4_max(r7, a7lo); a7hi = f32x4_max(r7, a7hi); + } + FusedKerSpec::PerRowAdd(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); a0lo = f32x4_add(r0, a0lo); a0hi = f32x4_add(r0, a0hi); + let r1 = f32x4_splat(r[1]); a1lo = f32x4_add(r1, a1lo); a1hi = f32x4_add(r1, a1hi); + let r2 = f32x4_splat(r[2]); a2lo = f32x4_add(r2, a2lo); a2hi = f32x4_add(r2, a2hi); + let r3 = f32x4_splat(r[3]); a3lo = f32x4_add(r3, a3lo); a3hi = f32x4_add(r3, a3hi); + let r4 = f32x4_splat(r[4]); a4lo = f32x4_add(r4, a4lo); a4hi = f32x4_add(r4, a4hi); + let r5 = f32x4_splat(r[5]); a5lo = f32x4_add(r5, a5lo); a5hi = f32x4_add(r5, a5hi); + let r6 = f32x4_splat(r[6]); a6lo = f32x4_add(r6, a6lo); a6hi = f32x4_add(r6, a6hi); + let r7 = f32x4_splat(r[7]); a7lo = f32x4_add(r7, a7lo); a7hi = f32x4_add(r7, a7hi); + } + FusedKerSpec::PerRowMul(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); a0lo = f32x4_mul(r0, a0lo); a0hi = f32x4_mul(r0, a0hi); + let r1 = f32x4_splat(r[1]); a1lo = f32x4_mul(r1, a1lo); a1hi = f32x4_mul(r1, a1hi); + let r2 = f32x4_splat(r[2]); a2lo = f32x4_mul(r2, a2lo); a2hi = f32x4_mul(r2, a2hi); + let r3 = f32x4_splat(r[3]); a3lo = f32x4_mul(r3, a3lo); a3hi = f32x4_mul(r3, a3hi); + let r4 = f32x4_splat(r[4]); a4lo = f32x4_mul(r4, a4lo); a4hi = f32x4_mul(r4, a4hi); + let r5 = f32x4_splat(r[5]); a5lo = f32x4_mul(r5, a5lo); a5hi = f32x4_mul(r5, a5hi); + let r6 = f32x4_splat(r[6]); a6lo = f32x4_mul(r6, a6lo); a6hi = f32x4_mul(r6, a6hi); + let r7 = f32x4_splat(r[7]); a7lo = f32x4_mul(r7, a7lo); a7hi = f32x4_mul(r7, a7hi); + } + FusedKerSpec::PerRowSub(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); a0lo = f32x4_sub(r0, a0lo); a0hi = f32x4_sub(r0, a0hi); + let r1 = f32x4_splat(r[1]); a1lo = f32x4_sub(r1, a1lo); a1hi = f32x4_sub(r1, a1hi); + let r2 = f32x4_splat(r[2]); a2lo = f32x4_sub(r2, a2lo); a2hi = f32x4_sub(r2, a2hi); + let r3 = f32x4_splat(r[3]); 
a3lo = f32x4_sub(r3, a3lo); a3hi = f32x4_sub(r3, a3hi); + let r4 = f32x4_splat(r[4]); a4lo = f32x4_sub(r4, a4lo); a4hi = f32x4_sub(r4, a4hi); + let r5 = f32x4_splat(r[5]); a5lo = f32x4_sub(r5, a5lo); a5hi = f32x4_sub(r5, a5hi); + let r6 = f32x4_splat(r[6]); a6lo = f32x4_sub(r6, a6lo); a6hi = f32x4_sub(r6, a6hi); + let r7 = f32x4_splat(r[7]); a7lo = f32x4_sub(r7, a7lo); a7hi = f32x4_sub(r7, a7hi); + } + FusedKerSpec::PerRowSubF(row) => { + let r = std::slice::from_raw_parts(row, 8); + let r0 = f32x4_splat(r[0]); a0lo = f32x4_sub(a0lo, r0); a0hi = f32x4_sub(a0hi, r0); + let r1 = f32x4_splat(r[1]); a1lo = f32x4_sub(a1lo, r1); a1hi = f32x4_sub(a1hi, r1); + let r2 = f32x4_splat(r[2]); a2lo = f32x4_sub(a2lo, r2); a2hi = f32x4_sub(a2hi, r2); + let r3 = f32x4_splat(r[3]); a3lo = f32x4_sub(a3lo, r3); a3hi = f32x4_sub(a3hi, r3); + let r4 = f32x4_splat(r[4]); a4lo = f32x4_sub(a4lo, r4); a4hi = f32x4_sub(a4hi, r4); + let r5 = f32x4_splat(r[5]); a5lo = f32x4_sub(a5lo, r5); a5hi = f32x4_sub(a5hi, r5); + let r6 = f32x4_splat(r[6]); a6lo = f32x4_sub(a6lo, r6); a6hi = f32x4_sub(a6hi, r6); + let r7 = f32x4_splat(r[7]); a7lo = f32x4_sub(a7lo, r7); a7hi = f32x4_sub(a7hi, r7); + } + FusedKerSpec::PerColMin(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + a0lo = f32x4_min(clo, a0lo); a0hi = f32x4_min(chi, a0hi); + a1lo = f32x4_min(clo, a1lo); a1hi = f32x4_min(chi, a1hi); + a2lo = f32x4_min(clo, a2lo); a2hi = f32x4_min(chi, a2hi); + a3lo = f32x4_min(clo, a3lo); a3hi = f32x4_min(chi, a3hi); + a4lo = f32x4_min(clo, a4lo); a4hi = f32x4_min(chi, a4hi); + a5lo = f32x4_min(clo, a5lo); a5hi = f32x4_min(chi, a5hi); + a6lo = f32x4_min(clo, a6lo); a6hi = f32x4_min(chi, a6hi); + a7lo = f32x4_min(clo, a7lo); a7hi = f32x4_min(chi, a7hi); + } + FusedKerSpec::PerColMax(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + a0lo = f32x4_max(clo, a0lo); a0hi = f32x4_max(chi, a0hi); + a1lo = f32x4_max(clo, 
a1lo); a1hi = f32x4_max(chi, a1hi); + a2lo = f32x4_max(clo, a2lo); a2hi = f32x4_max(chi, a2hi); + a3lo = f32x4_max(clo, a3lo); a3hi = f32x4_max(chi, a3hi); + a4lo = f32x4_max(clo, a4lo); a4hi = f32x4_max(chi, a4hi); + a5lo = f32x4_max(clo, a5lo); a5hi = f32x4_max(chi, a5hi); + a6lo = f32x4_max(clo, a6lo); a6hi = f32x4_max(chi, a6hi); + a7lo = f32x4_max(clo, a7lo); a7hi = f32x4_max(chi, a7hi); + } + FusedKerSpec::PerColAdd(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + a0lo = f32x4_add(clo, a0lo); a0hi = f32x4_add(chi, a0hi); + a1lo = f32x4_add(clo, a1lo); a1hi = f32x4_add(chi, a1hi); + a2lo = f32x4_add(clo, a2lo); a2hi = f32x4_add(chi, a2hi); + a3lo = f32x4_add(clo, a3lo); a3hi = f32x4_add(chi, a3hi); + a4lo = f32x4_add(clo, a4lo); a4hi = f32x4_add(chi, a4hi); + a5lo = f32x4_add(clo, a5lo); a5hi = f32x4_add(chi, a5hi); + a6lo = f32x4_add(clo, a6lo); a6hi = f32x4_add(chi, a6hi); + a7lo = f32x4_add(clo, a7lo); a7hi = f32x4_add(chi, a7hi); + } + FusedKerSpec::PerColMul(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + a0lo = f32x4_mul(clo, a0lo); a0hi = f32x4_mul(chi, a0hi); + a1lo = f32x4_mul(clo, a1lo); a1hi = f32x4_mul(chi, a1hi); + a2lo = f32x4_mul(clo, a2lo); a2hi = f32x4_mul(chi, a2hi); + a3lo = f32x4_mul(clo, a3lo); a3hi = f32x4_mul(chi, a3hi); + a4lo = f32x4_mul(clo, a4lo); a4hi = f32x4_mul(chi, a4hi); + a5lo = f32x4_mul(clo, a5lo); a5hi = f32x4_mul(chi, a5hi); + a6lo = f32x4_mul(clo, a6lo); a6hi = f32x4_mul(chi, a6hi); + a7lo = f32x4_mul(clo, a7lo); a7hi = f32x4_mul(chi, a7hi); + } + FusedKerSpec::PerColSub(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + a0lo = f32x4_sub(clo, a0lo); a0hi = f32x4_sub(chi, a0hi); + a1lo = f32x4_sub(clo, a1lo); a1hi = f32x4_sub(chi, a1hi); + a2lo = f32x4_sub(clo, a2lo); a2hi = f32x4_sub(chi, a2hi); + a3lo = f32x4_sub(clo, a3lo); a3hi = f32x4_sub(chi, a3hi); + a4lo = 
f32x4_sub(clo, a4lo); a4hi = f32x4_sub(chi, a4hi); + a5lo = f32x4_sub(clo, a5lo); a5hi = f32x4_sub(chi, a5hi); + a6lo = f32x4_sub(clo, a6lo); a6hi = f32x4_sub(chi, a6hi); + a7lo = f32x4_sub(clo, a7lo); a7hi = f32x4_sub(chi, a7hi); + } + FusedKerSpec::PerColSubF(cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + a0lo = f32x4_sub(a0lo, clo); a0hi = f32x4_sub(a0hi, chi); + a1lo = f32x4_sub(a1lo, clo); a1hi = f32x4_sub(a1hi, chi); + a2lo = f32x4_sub(a2lo, clo); a2hi = f32x4_sub(a2hi, chi); + a3lo = f32x4_sub(a3lo, clo); a3hi = f32x4_sub(a3hi, chi); + a4lo = f32x4_sub(a4lo, clo); a4hi = f32x4_sub(a4hi, chi); + a5lo = f32x4_sub(a5lo, clo); a5hi = f32x4_sub(a5hi, chi); + a6lo = f32x4_sub(a6lo, clo); a6hi = f32x4_sub(a6hi, chi); + a7lo = f32x4_sub(a7lo, clo); a7hi = f32x4_sub(a7hi, chi); + } + FusedKerSpec::QScale(shift, rp, mult) => { + let scaler = Scaler::from_fuse_params(shift, rp, mult); + let s = f32x4_splat(scaler.scale); + a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::RoundingShiftRight(shift, _rp) => { + let s = f32x4_splat(2f32.powi(-(shift as i32))); + a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi); + } + 
FusedKerSpec::ShiftLeft(shift) => { + let s = f32x4_splat(2f32.powi(shift as i32)); + a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi); + a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi); + a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi); + a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi); + a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi); + a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi); + a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi); + a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi); + } + FusedKerSpec::AddUnicast(tile) => { + // 8 rows × 8 cols, each row laid out per col_byte_stride + let mut ptr: *const u8 = tile.ptr; + for ab_pair in [(&mut a0lo, &mut a0hi), (&mut a1lo, &mut a1hi), + (&mut a2lo, &mut a2hi), (&mut a3lo, &mut a3hi), + (&mut a4lo, &mut a4hi), (&mut a5lo, &mut a5hi), + (&mut a6lo, &mut a6hi), (&mut a7lo, &mut a7hi)].iter_mut() { + let m0 = *(ptr as *const f32); + let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32); + let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32); + let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32); + let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32); + let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32); + let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32); + let m7 = *(ptr.offset(tile.col_byte_stride * 7) as *const f32); + let (lo, hi) = ab_pair; + **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3)); + **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7)); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddRowColProducts(rows, cols) => { + let p = cols as *const v128; + let clo = v128_load(p); let chi = v128_load(p.add(1)); + let r0 = f32x4_splat(*rows.add(0)); a0lo = f32x4_add(a0lo, f32x4_mul(r0, clo)); a0hi = f32x4_add(a0hi, f32x4_mul(r0, chi)); + let r1 = f32x4_splat(*rows.add(1)); a1lo = f32x4_add(a1lo, f32x4_mul(r1, clo)); a1hi = f32x4_add(a1hi, f32x4_mul(r1, chi)); + let r2 = f32x4_splat(*rows.add(2)); 
a2lo = f32x4_add(a2lo, f32x4_mul(r2, clo)); a2hi = f32x4_add(a2hi, f32x4_mul(r2, chi)); + let r3 = f32x4_splat(*rows.add(3)); a3lo = f32x4_add(a3lo, f32x4_mul(r3, clo)); a3hi = f32x4_add(a3hi, f32x4_mul(r3, chi)); + let r4 = f32x4_splat(*rows.add(4)); a4lo = f32x4_add(a4lo, f32x4_mul(r4, clo)); a4hi = f32x4_add(a4hi, f32x4_mul(r4, chi)); + let r5 = f32x4_splat(*rows.add(5)); a5lo = f32x4_add(a5lo, f32x4_mul(r5, clo)); a5hi = f32x4_add(a5hi, f32x4_mul(r5, chi)); + let r6 = f32x4_splat(*rows.add(6)); a6lo = f32x4_add(a6lo, f32x4_mul(r6, clo)); a6hi = f32x4_add(a6hi, f32x4_mul(r6, chi)); + let r7 = f32x4_splat(*rows.add(7)); a7lo = f32x4_add(a7lo, f32x4_mul(r7, clo)); a7hi = f32x4_add(a7hi, f32x4_mul(r7, chi)); + } + FusedKerSpec::Store(tile) => { + // 8 rows × 8 cols stores + let mut ptr: *mut u8 = tile.ptr; + for (lo, hi) in [(a0lo, a0hi), (a1lo, a1hi), (a2lo, a2hi), (a3lo, a3hi), + (a4lo, a4hi), (a5lo, a5hi), (a6lo, a6hi), (a7lo, a7hi)].iter() { + *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo); + *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(*lo); + *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(*lo); + *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(*lo); + *(ptr.offset(tile.col_byte_stride * 4) as *mut f32) = f32x4_extract_lane::<0>(*hi); + *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) = f32x4_extract_lane::<1>(*hi); + *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) = f32x4_extract_lane::<2>(*hi); + *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) = f32x4_extract_lane::<3>(*hi); + ptr = ptr.add(tile.row_byte_stride as usize); + } + } + FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => { + // A: packed [k][MR=8] = each k iter loads 8 row values + // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128 + let a = pa as *const f32; + let b = pb as *const v128; + for i in 0..k { + let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8); + 
let blo = v128_load(b.offset((2 * i) as isize)); + let bhi = v128_load(b.offset((2 * i + 1) as isize)); + let s = f32x4_splat(arow[0]); a0lo = f32x4_add(a0lo, f32x4_mul(s, blo)); a0hi = f32x4_add(a0hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[1]); a1lo = f32x4_add(a1lo, f32x4_mul(s, blo)); a1hi = f32x4_add(a1hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[2]); a2lo = f32x4_add(a2lo, f32x4_mul(s, blo)); a2hi = f32x4_add(a2hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[3]); a3lo = f32x4_add(a3lo, f32x4_mul(s, blo)); a3hi = f32x4_add(a3hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[4]); a4lo = f32x4_add(a4lo, f32x4_mul(s, blo)); a4hi = f32x4_add(a4hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[5]); a5lo = f32x4_add(a5lo, f32x4_mul(s, blo)); a5hi = f32x4_add(a5hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[6]); a6lo = f32x4_add(a6lo, f32x4_mul(s, blo)); a6hi = f32x4_add(a6hi, f32x4_mul(s, bhi)); + let s = f32x4_splat(arow[7]); a7lo = f32x4_add(a7lo, f32x4_mul(s, blo)); a7hi = f32x4_add(a7hi, f32x4_mul(s, bhi)); + } + } + } + pnl = pnl.add(1); + } + 0 +} + +MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8(8,8)@(8,8) quality(ImplementationQuality::TargetOptimized)); diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma.rs new file mode 100644 index 000000000..271aaeeaf --- /dev/null +++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma.rs @@ -0,0 +1,49 @@ +use crate::frame::element_wise::ElementWiseKer; +use crate::frame::reduce::{MapReduceKer, ReduceKer}; +use crate::x86_64_fma::softmax::x86_64_fma_softmax2_fastcompact_f32_32n; +use crate::Ops; + +pub mod mmm; + +pub mod by_scalar; +mod intel; +pub mod max; +pub mod panel_extract; +pub mod softmax; + +const AVX2: fn() -> bool = || is_x86_feature_detected!("avx2"); +const FMA: fn() -> bool = || is_x86_feature_detected!("fma"); +const AVX512F: fn() -> bool = || is_x86_feature_detected!("avx512f"); + +tanh_impl!(f32, fma_tanh_f32, 8, 8, 
is_x86_feature_detected!("fma"));
+sigmoid_impl!(f32, fma_sigmoid_f32, 8, 8, is_x86_feature_detected!("fma"));
+
+/// Hook for the AVX2 feature level; it currently installs nothing.
+fn plug_avx2(_ops: &mut Ops) {}
+
+/// Installs the FMA-level kernels into `ops`: the panel extractors, sigmoid,
+/// tanh, multiply-by-scalar, max and the fast-compact softmax reduction.
+fn plug_fma(ops: &mut Ops) {
+    panel_extract::plug(ops);
+
+    ops.sigmoid_f32 = Box::new(|| fma_sigmoid_f32::ew());
+    ops.tanh_f32 = Box::new(|| fma_tanh_f32::ew());
+
+    ops.mul_by_scalar_f32 = Box::new(|| by_scalar::x86_64_avx_f32_mul_by_scalar_32n::ew());
+    ops.max_f32 = Box::new(|| max::x86_64_fma_max_f32_32n::red());
+    ops.softmax2_fastcompact_f32 = Box::new(|| x86_64_fma_softmax2_fastcompact_f32_32n::red());
+
+    // The message names only sigmoid/tanh although more slots were replaced above.
+    log::info!("sigmoid_f32, tanh_f32: x86_64/fma activated");
+}
+
+/// Hook for the AVX-512F feature level; it currently installs nothing.
+fn plug_avx512f(_ops: &mut Ops) {}
+
+/// Entry point for this module: always runs `mmm::plug`, then the feature
+/// hooks. The checks are nested, so `fma` is only probed when `avx2` is
+/// present and `avx512f` only when `fma` is.
+pub fn plug(ops: &mut Ops) {
+    mmm::plug(ops);
+    if is_x86_feature_detected!("avx2") {
+        plug_avx2(ops);
+        if is_x86_feature_detected!("fma") {
+            plug_fma(ops);
+            if is_x86_feature_detected!("avx512f") {
+                plug_avx512f(ops);
+            }
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs
new file mode 100644
index 000000000..dacef2425
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs
@@ -0,0 +1,56 @@
+ew_impl_wrap!(
+    f32,
+    x86_64_avx_f32_mul_by_scalar_32n,
+    32,
+    8,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        unsafe { x86_64_avx_f32_mul_by_scalar_32n_run(x, s) }
+    }
+);
+
+/// Multiplies every element of `buf` by `scalar` in place, 32 `f32` values
+/// (four `ymm` registers) per loop iteration.
+///
+/// An empty `buf` is a no-op.
+///
+/// # Safety
+///
+/// The CPU must support AVX. The loop consumes 32 elements per pass and uses
+/// `vmovaps` with `ymm` operands, so `buf.len()` must be a multiple of 32 and
+/// `buf` must satisfy the alignment that instruction requires (the `run`
+/// wrapper above only checks both in debug builds).
+#[target_feature(enable = "avx")]
+unsafe fn x86_64_avx_f32_mul_by_scalar_32n_run(buf: &mut [f32], scalar: f32) {
+    // The asm loop tests its counter only after the first pass, so a zero
+    // length would write past the slice and wrap the counter.
+    if buf.is_empty() {
+        return;
+    }
+    unsafe {
+        let len = buf.len();
+        // The asm stores through this pointer, so it has to be derived from
+        // the mutable borrow; a pointer from `as_ptr()` must not be written to.
+        let ptr = buf.as_mut_ptr();
+        std::arch::asm!("
+            vbroadcastss ymm0, xmm0
+            2:
+                vmovaps ymm4, [{ptr}]
+                vmovaps ymm5, [{ptr} + 32]
+                vmovaps ymm6, [{ptr} + 64]
+                vmovaps ymm7, [{ptr} + 96]
+                vmulps ymm4, ymm4, ymm0
+                vmulps ymm5, ymm5, ymm0
+                vmulps ymm6, ymm6, ymm0
+                vmulps ymm7, ymm7, ymm0
+                vmovaps [{ptr}], ymm4
+                vmovaps [{ptr} + 32], ymm5
+                vmovaps [{ptr} + 64], ymm6
+                vmovaps [{ptr} + 96], ymm7
+                add {ptr}, 128
+                sub {len}, 32
+                jnz 2b
+            ",
+        len = inout(reg) len => _,
+        ptr = inout(reg) ptr => _,
+        in("xmm0") scalar,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _
+        );
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test_x86_64_avx_f32_mul_by_scalar_32n {
+    use super::*;
+    by_scalar_frame_tests!(
+        is_x86_feature_detected!("avx2"),
+        f32,
+        x86_64_avx_f32_mul_by_scalar_32n,
+        |a, b| a * b
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs
new file mode 100644
index 000000000..277fb6986
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs
@@ -0,0 +1,5 @@
+use crate::frame::mmm::cost_model::CostModel;
+/// Returns the named cost models for this target; the list is empty here.
+#[allow(dead_code)]
+pub fn models() -> Vec<(&'static str, CostModel<'static>)> {
+vec!(
+)}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs
new file mode 100644
index 000000000..cea571047
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs
@@ -0,0 +1,67 @@
+reduce_impl_wrap!(
+    f32,
+    x86_64_fma_max_f32_32n,
+    32,
+    8,
+    (),
+    f32::MIN,
+    #[inline(never)]
+    fn run(buf: &[f32], _: ()) -> f32 {
+        assert!(buf.len() % 32 == 0);
+        assert!(buf.len() > 0);
+        unsafe { x86_64_fma_max_f32_32n_run(buf) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f32, b: f32) -> f32 {
+        a.max(b)
+    }
+);
+
+/// Returns the maximum of `buf`, starting from `f32::MIN`.
+///
+/// Four `ymm` accumulators are updated with 32 elements per pass, then folded
+/// together and reduced horizontally to a single lane.
+///
+/// # Safety
+///
+/// The CPU must support AVX. `buf` must be non-empty with a length that is a
+/// multiple of 32 (asserted by `run` above), because the loop tests its
+/// counter only after the first pass; the `vmovaps` loads also need `buf` to
+/// meet that instruction's alignment requirement.
+#[target_feature(enable = "avx")]
+unsafe fn x86_64_fma_max_f32_32n_run(buf: &[f32]) -> f32 {
+    unsafe {
+        let len = buf.len();
+        let ptr = buf.as_ptr();
+        let mut acc = f32::MIN;
+        std::arch::asm!("
+            vbroadcastss ymm0, xmm0
+            vmovaps ymm1, ymm0
+            vmovaps ymm2, ymm0
+            vmovaps ymm3, ymm0
+            2:
+                vmovaps ymm4, [{ptr}]
+                vmovaps ymm5, [{ptr} + 32]
+                vmovaps ymm6, [{ptr} + 64]
+                vmovaps ymm7, [{ptr} + 96]
+                vmaxps ymm0, ymm0, ymm4
+                vmaxps ymm1, ymm1, ymm5
+                vmaxps ymm2, ymm2, ymm6
+                vmaxps ymm3, ymm3, ymm7
+                add {ptr}, 128
+                sub {len}, 32
+                jnz 2b
+            vmaxps ymm0, ymm0, ymm1
+            vmaxps ymm2, ymm2, ymm3
+            vmaxps ymm0, ymm0, ymm2
+            vperm2f128 ymm1, ymm0, ymm0, 1 // copy second half (4xf32) of ymm0 to ymm1
+            vmaxps xmm0, xmm0, xmm1 // xmm0 contains 4 values to max
+            vpermilps xmm1, xmm0, 2 + (3 << 2) // second 2x32 bit half moved to top
+            vmaxps xmm0, xmm0, xmm1 // xmm0 containes 2 values
+            vpermilps xmm1, xmm0, 1 // second f32 to top
+            vmaxps xmm0, xmm0, xmm1
+            ",
+        len = inout(reg) len => _,
+        ptr = inout(reg) ptr => _,
+        inout("ymm0") acc,
+        out("ymm1") _, out("ymm2") _, out("ymm3") _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _
+        );
+        acc
+    }
+}
+
+#[cfg(test)]
+mod test_x86_64_fma_max_f32_32n {
+    use super::*;
+    crate::max_frame_tests!(is_x86_feature_detected!("avx2"), f32, x86_64_fma_max_f32_32n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs
new file mode 100644
index 000000000..2bf936304
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs
@@ -0,0 +1,172 @@
+use crate::block_quant::*;
+use crate::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::pack::PackedFormat;
+use crate::Ops;
+use tract_data::internal::*;
+use DatumType::*;
+
+use super::*;
+
+// f32 kernels gated on the `FMA` check.
+MMMExternKernel!(fma_mmm_f32_8x8 (8, 8)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_16x6(16,6)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_16x5(16,5)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_24x4(24,4)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_40x2(40,2)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_64x1(64,1)@(256,4) where(FMA) quality(ManuallyOptimized));
+
+/// Packed block-quant format built from `Q4_0` with the arguments
+/// `(32, 16, false)`; shared by the 32x1 kernel and the Q4_0 panel extractor.
+pub fn pq40_r32() -> PackedBlockQuantFormat {
+    PackedBlockQuantFormat::new(&Q4_0, 32, 16, false)
+}
+MMMExternKernel!
{fma_mmm_f32_32x1(32,1)@(256,4) where(FMA)
+    packing[1] = q40f32 => |k| k.with_packing_a(pq40_r32());
+    packing[2] = q40f16 => |k| k.with_packing(pq40_r32(), PackedFormat::new(F16, 1, 2));
+    packing[3] = f16f16 => |k| k.with_packing(PackedFormat::new(F16, 32, 32), PackedFormat::new(F16, 1, 2));
+    quality(ManuallyOptimized)
+    store(f16)
+}
+// 32x3 FMA kernel with one extra packing (f32 packed by 32 and aligned to 256,
+// paired with f16 packed by 3 -- presumably the A and B operands, confirm
+// against the macro) and an additional f16 store.
+MMMExternKernel!(fma_mmm_f32_32x3(32,3)@(256,4) where(FMA)
+    packing[1] = f32f16 => |k| k.with_packing(f32::packing(32).align(256), f16::packing(3));
+    quality(ManuallyOptimized)
+    store(f16)
+);
+
+// f32 kernels gated on the `AVX512F` check.
+// NOTE(review): 16x8, 32x6 and 32x5 are declared here, but `plug_avx512f` in
+// this file neither registers nor dispatches to them, and 16x1 is returned by
+// its `mmv_f32` closure without being pushed to `mmm_impls` -- confirm this is
+// intended.
+MMMExternKernel!(avx512_mmm_f32_128x1(128, 1)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_16x1 ( 16, 1)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_16x12( 16,12)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_16x8 ( 16, 8)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_32x6 ( 32, 6)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_32x5 ( 32, 5)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_48x4 ( 48, 4)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_64x3 ( 64, 3)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_80x2 ( 80, 2)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+
+MMMExternKernel!
{ avx2_mmm_i32_8x8(8,8)@(256,4) where(AVX2) + packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 8, 256), PackedFormat::new(DatumType::I8, 8, 4)); + quality(ManuallyOptimized) + store(i8) +} + +pub fn plug(ops: &mut Ops) { + if is_x86_feature_detected!("avx2") { + plug_avx2(ops); + if is_x86_feature_detected!("fma") { + plug_fma(ops); + if is_x86_feature_detected!("avx512f") { + plug_avx512f(ops); + } + } + } +} + +pub fn plug_avx2(ops: &mut Ops) { + ops.mmm_impls.push(mmm::avx2_mmm_i32_8x8.mmm()); + ops.qmmm_i32 = Box::new(|_, _, _| mmm::avx2_mmm_i32_8x8.mmm()); + log::info!("qmmm_i32: x86_64/avx2 activated"); +} + +pub fn plug_fma(ops: &mut Ops) { + ops.mmm_impls.extend([ + fma_mmm_f32_8x8.mmm(), + fma_mmm_f32_16x5.mmm(), + fma_mmm_f32_16x6.mmm(), + fma_mmm_f32_24x4.mmm(), + fma_mmm_f32_32x3.mmm(), + fma_mmm_f32_40x2.mmm(), + fma_mmm_f32_64x1.mmm(), + ]); + + ops.mmv_f32 = Box::new(|_, _| fma_mmm_f32_64x1.mmm()); + + ops.mmm_f32 = Box::new(|_, _, n| { + if n.is_none() { + return fma_mmm_f32_16x6.mmm(); + } + + let n = n.unwrap(); + + match n { + 1 => unreachable!("should've been mmv"), + 2 => return fma_mmm_f32_40x2.mmm(), + 3 => return fma_mmm_f32_32x3.mmm(), + 4 => return fma_mmm_f32_24x4.mmm(), + 5 => return fma_mmm_f32_16x5.mmm(), + 6 => return fma_mmm_f32_16x6.mmm(), + 8 => return fma_mmm_f32_8x8.mmm(), + _ => {} + }; + + let scaling_baseline = 60.0; + let kernel_normalized_perf = [ + 44.0 / scaling_baseline, // 8x8 + 54.0 / scaling_baseline, // 2x6 + 54.0 / scaling_baseline, // 2x5 + 54.0 / scaling_baseline, // 3x4 + 54.0 / scaling_baseline, // 4x3 + 54.0 / scaling_baseline, // 5x2 + ]; + + fn compute_efficiency(n: usize, kernel_width: usize, scale: f32) -> f32 { + let kernel_width = kernel_width as f32; + let n = n as f32; + let batch_count = (n / kernel_width).ceil(); + let actual_count = batch_count * kernel_width; + let multi_batch_penalty = 1.0 - batch_count / 100.0; + n / actual_count * scale * multi_batch_penalty + } + + let 
efficiencies = [ + compute_efficiency(n, 8, kernel_normalized_perf[0]), + compute_efficiency(n, 6, kernel_normalized_perf[1]), + compute_efficiency(n, 5, kernel_normalized_perf[2]), + compute_efficiency(n, 4, kernel_normalized_perf[3]), + compute_efficiency(n, 3, kernel_normalized_perf[4]), + compute_efficiency(n, 2, kernel_normalized_perf[5]), + ]; + + let best_idx = efficiencies.iter().copied().enumerate().fold((0, 0.0), |max, val| { + if val.1 > max.1 { + val + } else { + max + } + }); + + match best_idx.0 { + 0 => fma_mmm_f32_8x8.mmm(), + 1 => fma_mmm_f32_16x6.mmm(), + 2 => fma_mmm_f32_16x5.mmm(), + 3 => fma_mmm_f32_24x4.mmm(), + 4 => fma_mmm_f32_32x3.mmm(), + 5 => fma_mmm_f32_40x2.mmm(), + _ => unreachable!("not a valid index"), + } + }); + log::info!("mmm_f32, mmv_f32: x86_64/fma activated"); + + if is_x86_feature_detected!("f16c") { + ops.mmm_impls.push(mmm::fma_mmm_f32_32x1.mmm()); // q40f32 requires f16c + log::info!("found f16c, added fake-f16 and q40-able kernels"); + } +} + +pub fn plug_avx512f(ops: &mut Ops) { + ops.mmm_impls.push(avx512_mmm_f32_128x1.mmm()); + ops.mmm_impls.push(avx512_mmm_f32_80x2.mmm()); + ops.mmm_impls.push(avx512_mmm_f32_48x4.mmm()); + ops.mmm_impls.push(avx512_mmm_f32_64x3.mmm()); + ops.mmm_impls.push(avx512_mmm_f32_16x12.mmm()); + ops.mmv_f32 = Box::new(|m, _k| match m { + Some(m) if m < 31 => avx512_mmm_f32_16x1.mmm(), + _ => avx512_mmm_f32_128x1.mmm(), + }); + + ops.mmm_f32 = Box::new(|m, _, n| match (m, n) { + (_, Some(1)) => unreachable!("should've been mmv"), + (_, Some(2)) => avx512_mmm_f32_80x2.mmm(), + (Some(m), _) if m <= 16 => mmm::avx512_mmm_f32_16x12.mmm(), + (_, Some(n)) if n % 4 == 0 && n % 3 != 0 && n < 32 => avx512_mmm_f32_48x4.mmm(), + (_, Some(n)) if n < 32 => avx512_mmm_f32_64x3.mmm(), + _ => avx512_mmm_f32_16x12.mmm(), + }); + log::info!("mmm_f32, mmv_f32: x86_64/avx512f activated"); +} diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs 
b/vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs
new file mode 100644
index 000000000..3077ba0cf
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs
@@ -0,0 +1,136 @@
+use super::*;
+use crate::pack::{PackedFormat, Packing};
+use crate::Ops;
+use tract_data::internal::*;
+
+/// Adds clones of the two panel extractors declared below to
+/// `ops.panel_extractors`.
+pub fn plug(ops: &mut Ops) {
+    ops.panel_extractors.extend([packed_32_q40_to_f32.clone(), packed_32_f16_to_f32.clone()]);
+}
+
+// Extractor from the `pq40_r32()` block-quant format to f32 packed by 32 with
+// alignment 32, gated on the `AVX2` check.
+panel_extractor!(kernel_packed_32_q40_to_f32 as packed_32_q40_to_f32(
+    Box::new(super::mmm::pq40_r32()),
+    f32::packing(32).align(32)
+) where(AVX2));
+
+// Extractor from f16 packed (32, 32) to f32 packed by 32 with alignment 32,
+// gated on the `AVX2` check.
+panel_extractor!(kernel_packed_32_f16_to_f32 as packed_32_f16_to_f32(
+    Box::new(PackedFormat::new(f16::datum_type(), 32, 32)),
+    f32::packing(32).align(32)
+) where(AVX2));
+
+/// Expands a packed 4-bit panel into f32.
+///
+/// Each outer pass loads 32 f16 scales (64 bytes) and widens them to f32.
+/// Then 32 inner steps each read 16 bytes, split them into low and high
+/// nibbles, subtract 8, convert to f32, multiply by the scales and store
+/// 32 f32 (128 bytes). The outer loop consumes `k` in steps of 32; `k == 0`
+/// returns immediately.
+///
+/// # Safety
+///
+/// `k` must be a multiple of 32 and `output` 32-byte aligned (both checked
+/// only in debug builds). `input` and `output` must be valid for the bytes
+/// the loops read and write.
+/// NOTE(review): the `vmovaps` loads from `input` look like they also need
+/// `input` to be 16-byte aligned, which is not asserted here -- confirm.
+/// NOTE(review): `vcvtph2ps` is an F16C instruction while the visible gating
+/// only mentions AVX2 -- confirm that is sufficient for supported CPUs.
+#[target_feature(enable = "avx2")]
+unsafe fn kernel_packed_32_q40_to_f32(input: *const u8, output: *mut u8, k: usize) {
+    unsafe {
+        if k == 0 {
+            return;
+        }
+        debug_assert!(k % 32 == 0);
+        debug_assert!(output as usize % 32 == 0);
+        std::arch::asm!("
+            vbroadcastss ymm14, dword ptr [{mask}]
+            vbroadcastss ymm13, dword ptr [{eight}]
+
+            2:
+                vmovaps xmm4, [{i}]
+                vmovaps xmm5, [{i} + 16]
+                vmovaps xmm6, [{i} + 32]
+                vmovaps xmm7, [{i} + 48]
+                vcvtph2ps ymm4, xmm4
+                vcvtph2ps ymm5, xmm5
+                vcvtph2ps ymm6, xmm6
+                vcvtph2ps ymm7, xmm7
+                add {i}, 64
+
+                mov {k2}, 32
+            3:
+                vmovaps xmm8, [{i}] // 32 nibbles
+                vpand xmm10, xmm8, xmm14 // 16 bytes
+                vpmovzxbd ymm9, xmm10 // 8 u32
+
+                vpermilpd xmm10, xmm10, 1 // swap 64bit halves
+                vpmovzxbd ymm10, xmm10 // 8 u32
+
+                vpsrlw xmm8, xmm8, 4
+                vpand xmm12, xmm8, xmm14 // 16 bytes
+                vpmovzxbd ymm11, xmm12 // 8 u32
+                vpermilpd xmm12, xmm12, 1 // swap 64bit halves
+                vpmovzxbd ymm12, xmm12 // 8 u32
+
+                vpsubd ymm9, ymm9, ymm13
+                vpsubd ymm10, ymm10, ymm13
+                vpsubd ymm11, ymm11, ymm13
+                vpsubd ymm12, ymm12, ymm13
+
+                vcvtdq2ps ymm9, ymm9
+                vcvtdq2ps ymm10, ymm10
+                vcvtdq2ps ymm11, ymm11
+                vcvtdq2ps ymm12, ymm12
+
+                vmulps ymm9, ymm9, ymm4
+                vmulps ymm10, ymm10, ymm5
+                vmulps ymm11, ymm11, ymm6
+                vmulps ymm12, ymm12, ymm7
+
+                vmovaps [{o}], ymm9
+                vmovaps [{o}+32], ymm10
+                vmovaps [{o}+64], ymm11
+                vmovaps [{o}+96], ymm12
+
+                add {i}, 16
+                add {o}, 128
+                sub {k2}, 1
+                jnz 3b
+
+                sub {k}, 32
+                jnz 2b;
+            ",
+        // Addresses of the nibble mask and of the constant 8, broadcast at the top of the asm.
+        mask = in(reg) &0x0F0F0F0F,
+        eight = in(reg) &0x08,
+        k = inout(reg) k => _,
+        k2 = out(reg) _,
+        i = inout(reg) input => _,
+        o = inout(reg) output => _,
+        out("ymm0") _, out("ymm1") _, out("ymm2") _, out("ymm3") _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _,
+        out("ymm8") _, out("ymm9") _, out("ymm10") _, out("ymm11") _,
+        out("ymm12") _, out("ymm13") _, out("ymm14") _, out("ymm15") _
+        );
+    }
+}
+
+/// Widens a packed f16 panel to f32: every loop pass converts 32 f16 values
+/// (64 bytes of `input`) into 32 f32 values (128 bytes of `output`) and
+/// decrements `k` by one. `k == 0` returns immediately.
+///
+/// # Safety
+///
+/// `output` must be 32-byte aligned (checked only in debug builds), and both
+/// pointers must be valid for `k` passes of 64 input and 128 output bytes.
+/// NOTE(review): the `vmovaps` loads from `input` look like they also need
+/// `input` to be 16-byte aligned, which is not asserted here -- confirm.
+/// NOTE(review): `vcvtph2ps` is an F16C instruction while the visible gating
+/// only mentions AVX2 -- confirm that is sufficient for supported CPUs.
+#[target_feature(enable = "avx2")]
+unsafe fn kernel_packed_32_f16_to_f32(input: *const u8, output: *mut u8, k: usize) {
+    unsafe {
+        if k == 0 {
+            return;
+        }
+        debug_assert!(output as usize % 32 == 0);
+        std::arch::asm!("
+            2:
+                vmovaps xmm4, [{i}]
+                vmovaps xmm5, [{i} + 16]
+                vmovaps xmm6, [{i} + 32]
+                vmovaps xmm7, [{i} + 48]
+
+                vcvtph2ps ymm4, xmm4
+                vcvtph2ps ymm5, xmm5
+                vcvtph2ps ymm6, xmm6
+                vcvtph2ps ymm7, xmm7
+
+                vmovaps [{o}], ymm4
+                vmovaps [{o}+32], ymm5
+                vmovaps [{o}+64], ymm6
+                vmovaps [{o}+96], ymm7
+
+                add {i}, 64
+                add {o}, 128
+
+                sub {k}, 1
+                jnz 2b;
+            ",
+        k = inout(reg) k => _,
+        i = inout(reg) input => _,
+        o = inout(reg) output => _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _,
+        );
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs
new file mode 100644
index 000000000..ed63d3ca4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs
@@ -0,0 +1,121 @@
+map_reduce_impl_wrap!(
+    f32,
+    x86_64_fma_softmax2_fastcompact_f32_32n,
+    32,
+    8,
+    f32,
+    f32::MIN,
+    0f32,
+    #[inline(never)]
+    fn run(buf: &mut [f32], max: f32) -> f32 {
+        assert!(buf.len() % 32 == 0);
+        assert!(buf.len() > 0);
+        unsafe { x86_64_fma_softmax2_fastcompact_f32_32n_run(buf, max) }
+    },
+    #[inline(never)]
+
fn reduce_two(a: f32, b: f32) -> f32 { + a + b + } +); + +#[target_feature(enable = "avx,fma")] +unsafe fn x86_64_fma_softmax2_fastcompact_f32_32n_run(buf: &mut [f32], max: f32) -> f32 { + unsafe { + let len = buf.len(); + let ptr = buf.as_ptr(); + let mut acc = 0f32; + const MLN2: f32 = 0.6931471805f32; + const A: f32 = 8388608.0f32; + const B: f32 = 1065353216.0f32; + const C: f32 = 60801.0f32; + const SLOPE: f32 = A / MLN2; + const OFFSET: f32 = B - C; + std::arch::asm!(" + vbroadcastss ymm0, xmm0 + vmovaps ymm1, ymm0 + vmovaps ymm2, ymm0 + vmovaps ymm3, ymm0 + + vpxor ymm12, ymm12, ymm12 + vbroadcastss ymm13, xmm13 + vbroadcastss ymm14, xmm14 + vbroadcastss ymm15, xmm15 + 2: + vmovaps ymm4, [{ptr}] + vmovaps ymm5, [{ptr} + 32] + vmovaps ymm6, [{ptr} + 64] + vmovaps ymm7, [{ptr} + 96] + + vsubps ymm4, ymm4, ymm13 + vsubps ymm5, ymm5, ymm13 + vsubps ymm6, ymm6, ymm13 + vsubps ymm7, ymm7, ymm13 + + vmovaps ymm8, ymm15 + vmovaps ymm9, ymm15 + vmovaps ymm10, ymm15 + vmovaps ymm11, ymm15 + + vfmadd231ps ymm8, ymm4, ymm14 + vfmadd231ps ymm9, ymm5, ymm14 + vfmadd231ps ymm10, ymm6, ymm14 + vfmadd231ps ymm11, ymm7, ymm14 + + vmaxps ymm8, ymm8, ymm12 + vmaxps ymm9, ymm9, ymm12 + vmaxps ymm10, ymm10, ymm12 + vmaxps ymm11, ymm11, ymm12 + + vcvttps2dq ymm8, ymm8 + vcvttps2dq ymm9, ymm9 + vcvttps2dq ymm10, ymm10 + vcvttps2dq ymm11, ymm11 + + vmovaps [{ptr}] , ymm8 + vmovaps [{ptr} + 32], ymm9 + vmovaps [{ptr} + 64], ymm10 + vmovaps [{ptr} + 96], ymm11 + + vaddps ymm0, ymm0, ymm8 + vaddps ymm1, ymm1, ymm9 + vaddps ymm2, ymm2, ymm10 + vaddps ymm3, ymm3, ymm11 + + add {ptr}, 128 + sub {len}, 32 + jnz 2b + + vaddps ymm0, ymm0, ymm1 + vaddps ymm2, ymm2, ymm3 + vaddps ymm0, ymm0, ymm2 + vperm2f128 ymm1, ymm0, ymm0, 1 + vaddps xmm0, xmm0, xmm1 + vpermilps xmm1, xmm0, 2 + (3 << 2) + vaddps xmm0, xmm0, xmm1 + vpermilps xmm1, xmm0, 1 + vaddps xmm0, xmm0, xmm1 + ", + len = inout(reg) len => _, + ptr = inout(reg) ptr => _, + inout("ymm0") acc, + out("ymm1") _, out("ymm2") _, out("ymm3") 
_, + out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _, + out("ymm8") _, out("ymm9") _, out("ymm10") _, out("ymm11") _, + out("ymm12") _, + inout("ymm13") max => _, + inout("ymm14") SLOPE => _, + inout("ymm15") OFFSET => _, + ); + acc + } +} + +#[cfg(test)] +mod test_x86_64_fma_softmax2_fastcompact_f32_32n { + use super::*; + crate::softmax_l2_frame_tests!( + is_x86_feature_detected!("fma"), + f32, + x86_64_fma_softmax2_fastcompact_f32_32n + ); +} diff --git a/vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs b/vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs new file mode 100644 index 000000000..095ce1844 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs @@ -0,0 +1,545 @@ +use std::alloc::Layout; +use std::fmt::Display; + +use proptest::arbitrary::Arbitrary; +use proptest::prelude::*; +use proptest::strategy::{BoxedStrategy, Strategy}; +use tract_data::internal::*; +use tract_linalg::mmm::FusedSpec; +use tract_linalg::mmm::{AsInputValue, EagerPackedInput, MMMInputFormat, MMMInputValue}; +use tract_linalg::pack::{PackedFormat, PackingWriter}; +use tract_linalg::WeightType; +use DatumType::F32; + +proptest::proptest! 
{ + #[test] + fn prop(pb in any::()) { + pb.check() + } +} + +#[test] +fn test1() { + ConvProblem { + lazy_im2col: false, + input: tensor3(&[[[1f32]]]), + filters: tensor4(&[[[[-1f32]]]]), + } + .check() +} + +#[test] +fn test_axes_0() { + // CHW HWIO CHW + // 121 1112 221 + ConvProblem { + lazy_im2col: false, + input: tensor3(&[[[0f32], [-1.0]]]), + filters: tensor4(&[[[[0f32, -1f32]]]]), + } + .check() +} + +#[test] +fn test_axes_1() { + ConvProblem { + lazy_im2col: false, + input: tensor3(&[[[0f32, 1.]]]), + filters: tensor4(&[[[[1f32]]]]), + } + .check() +} + +#[test] +fn test_lazy_0() { + ConvProblem { lazy_im2col: true, input: tensor3(&[[[1f32]]]), filters: tensor4(&[[[[1f32]]]]) } + .check() +} + +#[test] +fn test_lazy_1() { + ConvProblem { + lazy_im2col: true, + input: tensor3(&[[[0f32], [0.], [0.]]]), + filters: tensor4(&[[[[0f32]]]]), + } + .check() +} + +#[test] +fn test_lazy_2() { + ConvProblem { + lazy_im2col: true, + input: tensor3(&[[[0f32, 0.], [0., 1.]]]), + filters: tensor4(&[[[[0f32]], [[1.]]]]), + } + .check() +} + +#[test] +fn test_lazy_3() { + // CHW HWIO CHW + // 212 1221 111 + // im2col: k=4, n=1, k <- kh, kw, c + // 0 X X X X kh=0, kw=0, c=0 + // 1 X X X X kh=0, kw=0, c=1 + // 0 X X X X kh=0, kw=1, c=0 + // 0 X X X X kh=0, kw=1, c=1 + ConvProblem { + lazy_im2col: true, + input: tensor3(&[[[0f32, 0.]], [[1., 0.]]]), + filters: tensor4(&[[[[0f32], [0.]], [[1.], [0.]]]]), + } + .check() +} + +#[test] +fn test_eager_asan_0() { + ConvProblem { + lazy_im2col: false, + input: tensor(vec![3, 3, 5]), + filters: tensor(vec![3, 3, 3, 1]), + } + .check() +} + +// 2D valid, no group, no dil, no stride, HWIO, CHW +#[derive(Clone, Debug)] +pub struct ConvProblem { + pub lazy_im2col: bool, + pub input: Tensor, + pub filters: Tensor, +} + +fn mknhw(filters: &[usize], input: &[usize]) -> (usize, usize, usize, usize, usize) { + let m = filters[3]; + let k = filters[0..3].iter().product::(); + let h = input[1] - filters[0] + 1; + let w = input[2] - filters[1] 
+ 1; + let n = h * w; + (m, k, n, h, w) +} + +impl ConvProblem { + fn reference(&self) -> Tensor { + let (m, _, _, h, w) = mknhw(self.filters.shape(), self.input.shape()); + let output_shape = [m, h, w]; + let mut output = Tensor::zero::(&output_shape).unwrap(); + let mut output_view = output.to_array_view_mut::().unwrap(); + let input_view = self.input.to_array_view::().unwrap(); + let filters_view = self.filters.to_array_view::().unwrap(); + for geo_out in tract_ndarray::indices(&output_shape[1..]) { + for ker_geo in tract_ndarray::indices(&self.filters.shape()[0..2]) { + for ci in 0..self.filters.shape()[2] { + for co in 0..self.filters.shape()[3] { + let output_coord = [co, geo_out[0], geo_out[1]]; + let input_coord = [ci, geo_out[0] + ker_geo[0], geo_out[1] + ker_geo[1]]; + let ker_coord = [ker_geo[0], ker_geo[1], ci, co]; + output_view[output_coord] += + filters_view[ker_coord] * input_view[input_coord]; + } + } + } + } + output + } + + pub fn tract(&self) -> TractResult { + let (m, k, n, h, w) = mknhw(self.filters.shape(), self.input.shape()); + let output_shape = [m, h, w]; + let internal_output_shape = [m, h * w]; + let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(n)).unwrap(); + let output = Tensor::zero::(&internal_output_shape)?; + let reshaped_filters = self.filters.clone().into_shape(&[k, m])?; + let (a_pack, b_pack) = &mmm.packings()[0]; + let a = a_pack.prepare_one(&reshaped_filters, 0, 1)?; + unsafe { + let im2col: Box = if self.lazy_im2col { + LazyIm2colSpec { + full_kernel_shape: self.filters.shape().into(), + packer: b_pack.downcast_ref::().unwrap().clone(), + } + .wrap(&self.input.view()) + } else { + EagerIm2colSpec { + full_kernel_shape: self.filters.shape().into(), + packer: b_pack.downcast_ref::().unwrap().clone(), + } + .wrap(&self.input.view()) + }; + let c_store = mmm.c_view(Some(0), Some(1)).wrap(&output.view()); + mmm.run( + m, + n, + &[ + FusedSpec::AddMatMul { + a: AsInputValue::Owned(a), + b: 
AsInputValue::Owned(im2col), + packing: 0, + }, + FusedSpec::Store(c_store), + ], + ) + .unwrap() + } + output.into_shape(&output_shape) + } + + fn check(&self) { + let expected = self.reference(); + let found = self.tract().unwrap(); + if found.close_enough(&expected, true).is_err() { + println!("found: "); + println!("{:?}", found.to_array_view::().unwrap()); + println!("expected: "); + println!("{:?}", expected.to_array_view::().unwrap()); + } + found.close_enough(&expected, true).unwrap() + } +} + +impl Arbitrary for ConvProblem { + type Parameters = (); + type Strategy = BoxedStrategy; + fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy { + (any::(), 1..4usize, 1..4usize, 1..4usize, 1..4usize, 0..3usize, 0..3usize) + .prop_map(|(eager_im2col, h, w, i, o, extra_h, extra_w)| { + let filters = tensor(vec![h, w, i, o]); + let input = tensor(vec![i, h + extra_h, w + extra_w]); + ConvProblem { lazy_im2col: eager_im2col, filters, input } + }) + .boxed() + } +} + +fn tensor(shape: Vec) -> Tensor { + let mut tensor = Tensor::zero::(&shape).unwrap(); + tensor.as_slice_mut::().unwrap().iter_mut().enumerate().for_each(|(ix, x)| *x = ix as f32); + tensor +} + +#[derive(Clone, Debug, Hash, Eq, PartialEq)] +struct EagerIm2colSpec { + packer: PackedFormat, + full_kernel_shape: TVec, +} + +impl EagerIm2colSpec { + fn wrap(&self, input: &TensorView) -> Box { + let (_, k, n, h, w) = mknhw(&self.full_kernel_shape, input.shape()); + // let input = input.to_array_view::().unwrap(); + let ci = input.shape()[0]; + let kh = self.full_kernel_shape[0]; + let kw = self.full_kernel_shape[1]; + let im2col = tract_ndarray::Array5::::from_shape_fn( + [kh, kw, ci, h, w], + |(kh, kw, ci, h, w)| *input.at([ci, h + kh, w + kw]).unwrap(), + ) + .into_shape_with_order([k, n]) + .unwrap(); + Box::new(EagerIm2col { im2col: im2col.into_tensor(), packer: self.packer.clone(), k }) + } +} + +impl Display for EagerIm2colSpec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + write!(f, "EagerIm2colSpec") + } +} + +impl MMMInputFormat for EagerIm2colSpec { + fn prepare_tensor(&self, _t: &Tensor, _k_axis: usize, _mn_axis: usize) -> TractResult { + todo!(); + } + + fn precursor(&self) -> WeightType { + WeightType::Plain(f32::datum_type()) + } + + fn k_alignment(&self) -> usize { + 1 + } + + fn r(&self) -> usize { + self.packer.r() + } + + fn same_as(&self, other: &dyn MMMInputFormat) -> bool { + other.downcast_ref::().is_some_and(|other| other == self) + } + + fn mem_size(&self, _k: TDim, _mn: TDim) -> TDim { + unimplemented!() + } + + fn extract_at_mn_f16( + &self, + _data: &EagerPackedInput, + _mn: usize, + _slice: &mut [f16], + ) -> TractResult<()> { + todo!(); + } + + fn extract_at_mn_f32( + &self, + _data: &EagerPackedInput, + _mn: usize, + _slice: &mut [f32], + ) -> TractResult<()> { + todo!(); + } + + fn prepare_one( + &self, + _t: &Tensor, + _k_axis: usize, + _mn_axis: usize, + ) -> TractResult> { + todo!() + } +} + +#[derive(Clone, Debug, Hash)] +struct EagerIm2col { + packer: PackedFormat, + im2col: Tensor, + k: usize, +} + +impl Display for EagerIm2col { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "eager") + } +} + +impl MMMInputValue for EagerIm2col { + fn scratch_panel_buffer_layout(&self) -> Option { + Some( + Layout::from_size_align( + self.packer.single_panel_len(self.k) * f32::datum_type().size_of(), + self.packer.alignment(), + ) + .unwrap(), + ) + } + + fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8> { + let buffer = buffer.unwrap(); + let mn = self.im2col.shape()[1]; + unsafe { + self.packer.pack_t::( + buffer as _, + self.im2col.as_ptr().unwrap(), + mn, + mn as isize, + 1, + 0..self.k, + (i * self.packer.r)..((i + 1) * self.packer.r), + ); + } + Ok(buffer) + } + + fn k(&self) -> usize { + self.k + } + + fn mn(&self) -> usize { + self.im2col.shape()[1] + } + + fn format(&self) -> &dyn tract_linalg::mmm::MMMInputFormat { + 
&self.packer + } + + fn opaque_fact(&self) -> &dyn OpaqueFact { + unimplemented!() + } + + fn same_as(&self, _other: &dyn MMMInputValue) -> bool { + unimplemented!() + } + + fn extract_at_mn_f16(&self, _mn: usize, _slice: &mut [f16]) -> TractResult<()> { + unimplemented!() + } + + fn extract_at_mn_f32(&self, _mn: usize, _slice: &mut [f32]) -> TractResult<()> { + unimplemented!() + } +} + +#[derive(Clone, Debug, Hash, PartialEq, Eq)] +struct LazyIm2colSpec { + packer: PackedFormat, + full_kernel_shape: TVec, +} + +impl LazyIm2colSpec { + fn wrap(&self, input: &TensorView) -> Box { + let (_, _, _, h, w) = mknhw(&self.full_kernel_shape, input.shape()); + let kh = self.full_kernel_shape[0]; + let kw = self.full_kernel_shape[1]; + let ci = self.full_kernel_shape[2]; + let input_strides = input.strides(); + let k_offsets = (0..kh as isize) + .flat_map(|kh| { + (0..kw as isize).flat_map(move |kw| { + (0..ci as isize).map(move |ci| { + ci * input_strides[0] + kh * input_strides[1] + kw * input_strides[2] + }) + }) + }) + .collect(); + let n_offsets = (0..h as isize) + .flat_map(|h| { + (0..w as isize).map(move |w| (h * input_strides[1] + w * input_strides[2])) + }) + .collect(); + unsafe { + Box::new(LazyIm2col { + spec: self.clone(), + image: input.as_ptr_unchecked(), + k_offsets, + n_offsets, + packer: self.packer.clone(), + }) + } + } +} + +impl Display for LazyIm2colSpec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LazyIm2colSpec") + } +} + +impl MMMInputFormat for LazyIm2colSpec { + fn prepare_tensor(&self, _t: &Tensor, _k_axis: usize, _mn_axis: usize) -> TractResult { + todo!(); + } + fn prepare_one( + &self, + _t: &Tensor, + _k_axis: usize, + _mn_axis: usize, + ) -> TractResult> { + todo!(); + } + + fn precursor(&self) -> WeightType { + WeightType::Plain(f32::datum_type()) + } + + fn k_alignment(&self) -> usize { + 1 + } + + fn r(&self) -> usize { + self.packer.r() + } + + fn same_as(&self, other: &dyn MMMInputFormat) -> bool 
{ + other.downcast_ref::().is_some_and(|other| other == self) + } + + fn mem_size(&self, _k: TDim, _mn: TDim) -> TDim { + unimplemented!() + } + + fn extract_at_mn_f16( + &self, + _data: &EagerPackedInput, + _mn: usize, + _slice: &mut [f16], + ) -> TractResult<()> { + todo!(); + } + + fn extract_at_mn_f32( + &self, + _data: &EagerPackedInput, + _mn: usize, + _slice: &mut [f32], + ) -> TractResult<()> { + todo!(); + } +} + +#[derive(Clone, Debug, Hash)] +struct LazyIm2col { + spec: LazyIm2colSpec, + packer: PackedFormat, + image: *const f32, + n_offsets: Vec, + k_offsets: Vec, +} +unsafe impl Send for LazyIm2col {} +unsafe impl Sync for LazyIm2col {} + +impl Display for LazyIm2col { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "lazy") + } +} + +impl MMMInputValue for LazyIm2col { + fn scratch_panel_buffer_layout(&self) -> Option { + Some( + Layout::from_size_align( + self.packer.single_panel_len(self.k_offsets.len() * f32::datum_type().size_of()), + self.packer.alignment(), + ) + .unwrap(), + ) + } + + fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8> { + let buffer = buffer.unwrap() as *mut f32; + let mn_end = ((i + 1) * self.packer.r).min(self.n_offsets.len()); + let n_range = (i * self.packer.r)..mn_end; + let k = self.k_offsets.len(); + unsafe { + let mut writer = self.packer.write_with_k_outer(buffer, k, n_range.len()); + for k in 0..k { + for n in n_range.clone() { + writer.write( + *self.image.offset( + self.n_offsets.get_unchecked(n) + self.k_offsets.get_unchecked(k), + ), + ) + } + } + } + Ok(buffer as _) + } + + fn k(&self) -> usize { + self.k_offsets.len() + } + + fn mn(&self) -> usize { + self.n_offsets.len() + } + + fn format(&self) -> &dyn MMMInputFormat { + &self.spec + } + + fn opaque_fact(&self) -> &dyn OpaqueFact { + unimplemented!() + } + + fn same_as(&self, _other: &dyn MMMInputValue) -> bool { + unimplemented!() + } + + fn extract_at_mn_f16(&self, _mn: usize, _slice: &mut 
[f16]) -> TractResult<()> { + unimplemented!() + } + + fn extract_at_mn_f32(&self, _mn: usize, _slice: &mut [f32]) -> TractResult<()> { + unimplemented!() + } +} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..857f7821c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,59 @@ + // Tile size: 10x1 + // Accumulators: 0-9 + // Col regs: 10-19 + // Row regs: 20, 21 + + vbroadcastss zmm20, dword ptr [rcx] + + vmovaps zmm10, [rax + 0] + vmovaps zmm11, [rax + 64] + vmovaps zmm12, [rax + 128] + vmovaps zmm13, [rax + 192] + vmovaps zmm14, [rax + 256] + + vfmadd231ps zmm0, zmm10, zmm20 + vfmadd231ps zmm1, zmm11, zmm20 + vfmadd231ps zmm2, zmm12, zmm20 + vfmadd231ps zmm3, zmm13, zmm20 + vfmadd231ps zmm4, zmm14, zmm20 + + vmovaps zmm15, [rax + 320] + vmovaps zmm16, [rax + 384] + vmovaps zmm17, [rax + 448] + vmovaps zmm18, [rax + 512] + vmovaps zmm19, [rax + 576] + + vfmadd231ps zmm5, zmm10, zmm20 + vfmadd231ps zmm6, zmm11, zmm20 + vfmadd231ps zmm7, zmm12, zmm20 + vfmadd231ps zmm8, zmm13, zmm20 + vfmadd231ps zmm9, zmm14, zmm20 + + vbroadcastss zmm21, dword ptr [rcx + 4] + + vmovaps zmm10, [rax + 640] + vmovaps zmm11, [rax + 704] + vmovaps zmm12, [rax + 768] + vmovaps zmm13, [rax + 832] + vmovaps zmm14, [rax + 896] + + vfmadd231ps zmm0, zmm10, zmm21 + vfmadd231ps zmm1, zmm11, zmm21 + vfmadd231ps zmm2, zmm12, zmm21 + vfmadd231ps zmm3, zmm13, zmm21 + vfmadd231ps zmm4, zmm14, zmm21 + + vmovaps zmm15, [rax + 960] + vmovaps zmm16, [rax + 1024] + vmovaps zmm17, [rax + 1088] + vmovaps zmm18, [rax + 1152] + vmovaps zmm19, [rax + 1216] + + vfmadd231ps zmm5, zmm10, zmm21 + vfmadd231ps zmm6, zmm11, zmm21 + vfmadd231ps zmm7, zmm12, zmm21 + vfmadd231ps zmm8, zmm13, zmm21 + vfmadd231ps zmm9, zmm14, zmm21 + + add rcx, 8 + add rax, 1280 diff --git 
a/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..76aaae5bf --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,33 @@ + // Tile size: 10x1 + // Accumulators: 0-9 + // Col regs: 10-19 + // Row regs: 20 + + vbroadcastss zmm20, dword ptr [rcx] + + vmovaps zmm10, [rax + 0] + vmovaps zmm11, [rax + 64] + vmovaps zmm12, [rax + 128] + vmovaps zmm13, [rax + 192] + vmovaps zmm14, [rax + 256] + + vfmadd231ps zmm0, zmm10, zmm20 + vfmadd231ps zmm1, zmm11, zmm20 + vfmadd231ps zmm2, zmm12, zmm20 + vfmadd231ps zmm3, zmm13, zmm20 + vfmadd231ps zmm4, zmm14, zmm20 + + vmovaps zmm15, [rax + 320] + vmovaps zmm16, [rax + 384] + vmovaps zmm17, [rax + 448] + vmovaps zmm18, [rax + 512] + vmovaps zmm19, [rax + 576] + + vfmadd231ps zmm5, zmm10, zmm20 + vfmadd231ps zmm6, zmm11, zmm20 + vfmadd231ps zmm7, zmm12, zmm20 + vfmadd231ps zmm8, zmm13, zmm20 + vfmadd231ps zmm9, zmm14, zmm20 + + add rcx, 4 + add rax, 320 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..ba4e6232c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,7 @@ + vbroadcastss zmm15, dword ptr [rcx] + + vmovups zmm8, [rax] + vfmadd231ps zmm0, zmm15, zmm8 + + add rcx, 4 + add rax, 64 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli new file mode 100644 index 000000000..4a1c31083 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli @@ -0,0 +1,68 @@ + vmovups zmm31, [rcx] + // vbroadcastss zmm17, [rcx + 4 * 0] + // vbroadcastss zmm18, 
[rcx + 4 * 1] + // vbroadcastss zmm19, [rcx + 4 * 2] + // vbroadcastss zmm20, [rcx + 4 * 3] + // vbroadcastss zmm21, [rcx + 4 * 4] + // vbroadcastss zmm22, [rcx + 4 * 5] + // vbroadcastss zmm23, [rcx + 4 * 6] + // vbroadcastss zmm24, [rcx + 4 * 7] + // vbroadcastss zmm25, [rcx + 4 * 8] + // vbroadcastss zmm26, [rcx + 4 * 9] + // vbroadcastss zmm27, [rcx + 4 * 10] + // vbroadcastss zmm28, [rcx + 4 * 11] + // vbroadcastss zmm29, [rcx + 4 * 12] + // vbroadcastss zmm30, [rcx + 4 * 13] + // vbroadcastss zmm31, [rcx + 4 * 14] + + vbroadcastss zmm16, xmm31 + valignd zmm17, zmm31, zmm31, 1 + vbroadcastss zmm17, xmm17 + valignd zmm18, zmm31, zmm31, 2 + vbroadcastss zmm18, xmm18 + valignd zmm19, zmm31, zmm31, 3 + vbroadcastss zmm19, xmm19 + valignd zmm20, zmm31, zmm31, 4 + vbroadcastss zmm20, xmm20 + valignd zmm21, zmm31, zmm31, 5 + vbroadcastss zmm21, xmm21 + valignd zmm22, zmm31, zmm31, 6 + vbroadcastss zmm22, xmm22 + valignd zmm23, zmm31, zmm31, 7 + vbroadcastss zmm23, xmm23 + valignd zmm24, zmm31, zmm31, 8 + vbroadcastss zmm24, xmm24 + valignd zmm25, zmm31, zmm31, 9 + vbroadcastss zmm25, xmm25 + valignd zmm26, zmm31, zmm31, 10 + vbroadcastss zmm26, xmm26 + valignd zmm27, zmm31, zmm31, 11 + vbroadcastss zmm27, xmm27 + valignd zmm28, zmm31, zmm31, 12 + vbroadcastss zmm28, xmm28 + valignd zmm29, zmm31, zmm31, 13 + vbroadcastss zmm29, xmm29 + valignd zmm30, zmm31, zmm31, 14 + vbroadcastss zmm30, xmm30 + valignd zmm31, zmm31, zmm31, 15 + vbroadcastss zmm31, xmm31 + + vfmadd231ps zmm0, zmm16, [rax + 0] + vfmadd231ps zmm1, zmm17, [rax + 64] + vfmadd231ps zmm2, zmm18, [rax + 128] + vfmadd231ps zmm3, zmm19, [rax + 192] + vfmadd231ps zmm4, zmm20, [rax + 256] + vfmadd231ps zmm5, zmm21, [rax + 320] + vfmadd231ps zmm6, zmm22, [rax + 384] + vfmadd231ps zmm7, zmm23, [rax + 448] + vfmadd231ps zmm8, zmm24, [rax + 512] + vfmadd231ps zmm9, zmm25, [rax + 576] + vfmadd231ps zmm10, zmm26, [rax + 640] + vfmadd231ps zmm11, zmm27, [rax + 704] + vfmadd231ps zmm12, zmm28, [rax + 768] + vfmadd231ps 
zmm13, zmm29, [rax + 832] + vfmadd231ps zmm14, zmm30, [rax + 896] + vfmadd231ps zmm15, zmm31, [rax + 960] + + add rcx, 64 + add rax, 1024 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli new file mode 100644 index 000000000..103be7015 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli @@ -0,0 +1,24 @@ + // slow + vbroadcastss xmm16, dword ptr [rcx] + vbroadcastss xmm17, dword ptr [rcx + 4] + vbroadcastss xmm18, dword ptr [rcx + 8] + vbroadcastss xmm19, dword ptr [rcx + 12] + + // fast + vmovups xmm31, [rcx] + vbroadcastss zmm16, xmm31 + valignd xmm17, xmm31, xmm31, 1 + vbroadcastss zmm17, xmm17 + valignd xmm18, xmm31, xmm31, 2 + vbroadcastss zmm18, xmm18 + valignd xmm19, xmm31, xmm31, 3 + vbroadcastss zmm19, xmm19 + + // commmon + vfmadd231ps zmm0, zmm16, [rax + 0] + vfmadd231ps zmm1, zmm17, [rax + 64] + vfmadd231ps zmm2, zmm18, [rax + 128] + vfmadd231ps zmm3, zmm19, [rax + 192] + + add rcx, 16 + add rax, 256 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli new file mode 100644 index 000000000..d6cb277f8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli @@ -0,0 +1,29 @@ + vmovups ymm31, [rcx] + + vbroadcastss zmm16, xmm31 + valignd ymm17, ymm31, ymm31, 1 + vbroadcastss zmm17, xmm17 + valignd ymm18, ymm31, ymm31, 2 + vbroadcastss zmm18, xmm18 + valignd ymm19, ymm31, ymm31, 3 + vbroadcastss zmm19, xmm19 + valignd ymm20, ymm31, ymm31, 4 + vbroadcastss zmm20, xmm20 + valignd ymm21, ymm31, ymm31, 5 + vbroadcastss zmm21, xmm21 + valignd ymm22, ymm31, ymm31, 6 + vbroadcastss zmm22, xmm22 + valignd ymm23, ymm31, ymm31, 7 + vbroadcastss zmm23, xmm23 + + vfmadd231ps zmm0, zmm16, [rax + 0] + vfmadd231ps zmm1, zmm17, [rax + 64] + 
vfmadd231ps zmm2, zmm18, [rax + 128] + vfmadd231ps zmm3, zmm19, [rax + 192] + vfmadd231ps zmm4, zmm20, [rax + 256] + vfmadd231ps zmm5, zmm21, [rax + 320] + vfmadd231ps zmm6, zmm22, [rax + 384] + vfmadd231ps zmm7, zmm23, [rax + 448] + + add rcx, 32 + add rax, 512 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli new file mode 100644 index 000000000..8c9bf905b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli @@ -0,0 +1,11 @@ + vbroadcastss zmm15, dword ptr [rcx] + + vmovaps zmm8, [rax + 0] + vfmadd231ps zmm0, zmm15, zmm8 + + vbroadcastss zmm16, dword ptr [rcx + 4] + vmovaps zmm9, [rax + 64] + vfmadd231ps zmm1, zmm16, zmm9 + + add rcx, 8 + add rax, 128 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..4ffab3bd4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,45 @@ + // Tile size: 1x12 + // Accumulators: 0-11 + // Col regs: zmm14 + // Row regs: zmm15 + + vmovaps zmm15, [rax] + + vbroadcastss zmm14, dword ptr [rcx + 0 * 4] + vfmadd231ps zmm0, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 1 * 4] + vfmadd231ps zmm1, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 2 * 4] + vfmadd231ps zmm2, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 3 * 4] + vfmadd231ps zmm3, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 4 * 4] + vfmadd231ps zmm4, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 5 * 4] + vfmadd231ps zmm5, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 6 * 4] + vfmadd231ps zmm6, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 7 * 4] + vfmadd231ps zmm7, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 8 * 4] + vfmadd231ps zmm8, 
zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 9 * 4] + vfmadd231ps zmm9, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 10 * 4] + vfmadd231ps zmm10, zmm15, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 11 * 4] + vfmadd231ps zmm11, zmm15, zmm14 + + add rcx, 48 + add rax, 64 \ No newline at end of file diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..118d312c8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,53 @@ + // Accumulators: 0-9 + // Columns: 15-16 + // Rows: 10-14 + vbroadcastss zmm10, dword ptr [rcx] + vbroadcastss zmm11, dword ptr [rcx + 4] + vbroadcastss zmm12, dword ptr [rcx + 8] + vbroadcastss zmm13, dword ptr [rcx + 12] + vbroadcastss zmm14, dword ptr [rcx + 16] + + vmovaps zmm15, [rax] + vmovaps zmm16, [rax + 64] + + vfmadd231ps zmm0, zmm15, zmm10 + vfmadd231ps zmm1, zmm16, zmm10 + + vfmadd231ps zmm2, zmm15, zmm11 + vfmadd231ps zmm3, zmm16, zmm11 + + vfmadd231ps zmm4, zmm15, zmm12 + vfmadd231ps zmm5, zmm16, zmm12 + + vfmadd231ps zmm6, zmm15, zmm13 + vfmadd231ps zmm7, zmm16, zmm13 + + vfmadd231ps zmm8, zmm15, zmm14 + vfmadd231ps zmm9, zmm16, zmm14 + + vbroadcastss zmm10, dword ptr [rcx + 20] + vbroadcastss zmm11, dword ptr [rcx + 24] + vbroadcastss zmm12, dword ptr [rcx + 28] + vbroadcastss zmm13, dword ptr [rcx + 32] + vbroadcastss zmm14, dword ptr [rcx + 36] + + vmovaps zmm15, [rax + 128] + vmovaps zmm16, [rax + 192] + + vfmadd231ps zmm0, zmm15, zmm10 + vfmadd231ps zmm1, zmm16, zmm10 + + vfmadd231ps zmm2, zmm15, zmm11 + vfmadd231ps zmm3, zmm16, zmm11 + + vfmadd231ps zmm4, zmm15, zmm12 + vfmadd231ps zmm5, zmm16, zmm12 + + vfmadd231ps zmm6, zmm15, zmm13 + vfmadd231ps zmm7, zmm16, zmm13 + + vfmadd231ps zmm8, zmm15, zmm14 + vfmadd231ps zmm9, zmm16, zmm14 + + add rcx, 40 + add rax, 256 diff --git 
a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..e017834d2 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,30 @@ + // Accumulators: 0-9 + // Columns: 15 + // Rows: 10-14 + + vbroadcastss zmm10, dword ptr [rcx] + vbroadcastss zmm11, dword ptr [rcx + 4] + vbroadcastss zmm12, dword ptr [rcx + 8] + vbroadcastss zmm13, dword ptr [rcx + 12] + vbroadcastss zmm14, dword ptr [rcx + 16] + + vmovaps zmm15, [rax] + vmovaps zmm16, [rax + 64] + + vfmadd231ps zmm0, zmm15, zmm10 + vfmadd231ps zmm1, zmm16, zmm10 + + vfmadd231ps zmm2, zmm15, zmm11 + vfmadd231ps zmm3, zmm16, zmm11 + + vfmadd231ps zmm4, zmm15, zmm12 + vfmadd231ps zmm5, zmm16, zmm12 + + vfmadd231ps zmm6, zmm15, zmm13 + vfmadd231ps zmm7, zmm16, zmm13 + + vfmadd231ps zmm8, zmm15, zmm14 + vfmadd231ps zmm9, zmm16, zmm14 + + add rcx, 20 + add rax, 128 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..9d6c940a9 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,71 @@ + // Tile size: 2x6 + // Accumulators: 0-11 + // Col regs: zmm14-15 + // Row regs: zmm12-13 + + vbroadcastss zmm14, dword ptr [rcx] + vmovaps zmm12, [rax] + vmovaps zmm13, [rax + 64] + vbroadcastss zmm15, dword ptr [rcx + 4] + + vfmadd231ps zmm0, zmm12, zmm14 + vfmadd231ps zmm1, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 8] + + vfmadd231ps zmm2, zmm12, zmm15 + vfmadd231ps zmm3, zmm13, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 12] + + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm5, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 16] + + vfmadd231ps zmm6, zmm12, zmm15 + vfmadd231ps zmm7, zmm13, 
zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 20] + + vfmadd231ps zmm8, zmm12, zmm14 + vfmadd231ps zmm9, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx+24] + + vfmadd231ps zmm10, zmm12, zmm15 + vfmadd231ps zmm11, zmm13, zmm15 + + // Iteration two + vmovaps zmm12, [rax + 128] + vmovaps zmm13, [rax + 192] + vbroadcastss zmm15, dword ptr [rcx + 24 + 4] + + vfmadd231ps zmm0, zmm12, zmm14 + vfmadd231ps zmm1, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 24 + 8] + + vfmadd231ps zmm2, zmm12, zmm15 + vfmadd231ps zmm3, zmm13, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 24 + 12] + + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm5, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 24 + 16] + + vfmadd231ps zmm6, zmm12, zmm15 + vfmadd231ps zmm7, zmm13, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 24 + 20] + + vfmadd231ps zmm8, zmm12, zmm14 + vfmadd231ps zmm9, zmm13, zmm14 + + vfmadd231ps zmm10, zmm12, zmm15 + vfmadd231ps zmm11, zmm13, zmm15 + + add rax, 256 + add rcx, 48 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..31f861b10 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,39 @@ + // Tile size: 2x6 + // Accumulators: 0-11 + // Col regs: zmm14-15 + // Row regs: zmm12-13 + + // Load ordered by earliest use for first 2x2 block + vbroadcastss zmm14, dword ptr [rcx] + vmovaps zmm12, [rax] + vmovaps zmm13, [rax + 64] + vbroadcastss zmm15, dword ptr [rcx + 4] + + vfmadd231ps zmm0, zmm12, zmm14 + vfmadd231ps zmm1, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 8] + + vfmadd231ps zmm2, zmm12, zmm15 + vfmadd231ps zmm3, zmm13, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 12] + + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm5, zmm13, zmm14 + + vbroadcastss zmm14, dword ptr [rcx + 16] + + vfmadd231ps zmm6, zmm12, zmm15 + 
vfmadd231ps zmm7, zmm13, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 20] + + vfmadd231ps zmm8, zmm12, zmm14 + vfmadd231ps zmm9, zmm13, zmm14 + + vfmadd231ps zmm10, zmm12, zmm15 + vfmadd231ps zmm11, zmm13, zmm15 + + add rax, 128 + add rcx, 24 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..c36b7f6b6 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,63 @@ + // Tile size: 3x4 + // Accumulators: 0-11 + // Col regs: zmm12-14 + // Row regs: zmm15 + + vmovaps zmm12, [rax] + vmovaps zmm13, [rax+64] + vmovaps zmm14, [rax+128] + + vbroadcastss zmm15, dword ptr [rcx + 0] + + vfmadd231ps zmm0, zmm12, zmm15 + vfmadd231ps zmm1, zmm13, zmm15 + vfmadd231ps zmm2, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 4] + + vfmadd231ps zmm3, zmm12, zmm15 + vfmadd231ps zmm4, zmm13, zmm15 + vfmadd231ps zmm5, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 8] + + vfmadd231ps zmm6, zmm12, zmm15 + vfmadd231ps zmm7, zmm13, zmm15 + vfmadd231ps zmm8, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 12] + + vfmadd231ps zmm9, zmm12, zmm15 + vfmadd231ps zmm10, zmm13, zmm15 + vfmadd231ps zmm11, zmm14, zmm15 + + vmovaps zmm12, [rax + 192] + vmovaps zmm13, [rax + 256] + vmovaps zmm14, [rax + 320] + + vbroadcastss zmm15, dword ptr [rcx + 16] + + vfmadd231ps zmm0, zmm12, zmm15 + vfmadd231ps zmm1, zmm13, zmm15 + vfmadd231ps zmm2, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 20] + + vfmadd231ps zmm3, zmm12, zmm15 + vfmadd231ps zmm4, zmm13, zmm15 + vfmadd231ps zmm5, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 24] + + vfmadd231ps zmm6, zmm12, zmm15 + vfmadd231ps zmm7, zmm13, zmm15 + vfmadd231ps zmm8, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 28] + + vfmadd231ps zmm9, zmm12, zmm15 + vfmadd231ps zmm10, zmm13, zmm15 + 
vfmadd231ps zmm11, zmm14, zmm15 + + add rax, 384 + add rcx, 32 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..a8b1c3221 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,35 @@ + // Tile size: 3x4 + // Accumulators: 0-11 + // Col regs: zmm12-14 + // Row regs: zmm15 + + vmovaps zmm12, [rax] + vmovaps zmm13, [rax+64] + vmovaps zmm14, [rax+128] + + vbroadcastss zmm15, dword ptr [rcx + 0] + + vfmadd231ps zmm0, zmm12, zmm15 + vfmadd231ps zmm1, zmm13, zmm15 + vfmadd231ps zmm2, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 4] + + vfmadd231ps zmm3, zmm12, zmm15 + vfmadd231ps zmm4, zmm13, zmm15 + vfmadd231ps zmm5, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 8] + + vfmadd231ps zmm6, zmm12, zmm15 + vfmadd231ps zmm7, zmm13, zmm15 + vfmadd231ps zmm8, zmm14, zmm15 + + vbroadcastss zmm15, dword ptr [rcx + 12] + + vfmadd231ps zmm9, zmm12, zmm15 + vfmadd231ps zmm10, zmm13, zmm15 + vfmadd231ps zmm11, zmm14, zmm15 + + add rax, 192 + add rcx, 16 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..fe661b7fa --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,69 @@ + // Tile size: 4x3 + // Accumulators: 0-11 + // Col regs: zmm12 + // Row regs: zmm13-15 + + // Load col of A + vmovaps zmm12, [rax] + + // Fill 3 cols of B + vbroadcastss zmm13, dword ptr [rcx + 0] + vbroadcastss zmm14, dword ptr [rcx + 4] + vbroadcastss zmm15, dword ptr [rcx + 8] + + // N.B. 
Stepping cols in inner loop + vfmadd231ps zmm0, zmm12, zmm13 + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm8, zmm12, zmm15 + + vmovaps zmm12, [rax+64] + + vfmadd231ps zmm1, zmm12, zmm13 + vfmadd231ps zmm5, zmm12, zmm14 + vfmadd231ps zmm9, zmm12, zmm15 + + vmovaps zmm12, [rax+128] + + vfmadd231ps zmm2, zmm12, zmm13 + vfmadd231ps zmm6, zmm12, zmm14 + vfmadd231ps zmm10, zmm12, zmm15 + + vmovaps zmm12, [rax+192] + + vfmadd231ps zmm3, zmm12, zmm13 + vfmadd231ps zmm7, zmm12, zmm14 + vfmadd231ps zmm11, zmm12, zmm15 + + // Load col of A, switching col! + vmovaps zmm13, [rax + 256] + + // Fill 3 cols of B + vbroadcastss zmm14, dword ptr [rcx + 12] + vbroadcastss zmm15, dword ptr [rcx + 16] + vbroadcastss zmm12, dword ptr [rcx + 20] + + // N.B. Stepping cols in inner loop + vfmadd231ps zmm0, zmm13, zmm14 + vfmadd231ps zmm4, zmm13, zmm15 + vfmadd231ps zmm8, zmm13, zmm12 + + vmovaps zmm13, [rax + 320] + + vfmadd231ps zmm1, zmm13, zmm14 + vfmadd231ps zmm5, zmm13, zmm15 + vfmadd231ps zmm9, zmm13, zmm12 + + vmovaps zmm13, [rax + 384] + + vfmadd231ps zmm2, zmm13, zmm14 + vfmadd231ps zmm6, zmm13, zmm15 + vfmadd231ps zmm10, zmm13, zmm12 + + vmovaps zmm13, [rax + 448] + + vfmadd231ps zmm3, zmm13, zmm14 + vfmadd231ps zmm7, zmm13, zmm15 + vfmadd231ps zmm11, zmm13, zmm12 + + add rcx, 24 + add rax, 512 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..0e71a747e --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,38 @@ + // Tile size: 4x3 + // Accumulators: 0-11 + // Col regs: zmm12 + // Row regs: zmm13-15 + + // Load col of A + vmovaps zmm12, [rax] + + // Fill 3 cols of B + vbroadcastss zmm13, dword ptr [rcx + 0] + vbroadcastss zmm14, dword ptr [rcx + 4] + vbroadcastss zmm15, dword ptr [rcx + 8] + + // N.B. 
Stepping cols in inner loop + vfmadd231ps zmm0, zmm12, zmm13 + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm8, zmm12, zmm15 + + vmovaps zmm12, [rax+64] + + vfmadd231ps zmm1, zmm12, zmm13 + vfmadd231ps zmm5, zmm12, zmm14 + vfmadd231ps zmm9, zmm12, zmm15 + + vmovaps zmm12, [rax+128] + + vfmadd231ps zmm2, zmm12, zmm13 + vfmadd231ps zmm6, zmm12, zmm14 + vfmadd231ps zmm10, zmm12, zmm15 + + vmovaps zmm12, [rax+192] + + vfmadd231ps zmm3, zmm12, zmm13 + vfmadd231ps zmm7, zmm12, zmm14 + vfmadd231ps zmm11, zmm12, zmm15 + + add rcx, 12 + add rax, 256 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..6a5b887b8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,63 @@ + // Tile size: 5x2 + // Accumulators: 0-9 + // Col regs: zmm10-13 + // Row regs: zmm14-15 + + vmovaps zmm10, [rax] + vbroadcastss zmm14, dword ptr [rcx + 0] + vbroadcastss zmm15, dword ptr [rcx + 4] + vmovaps zmm11, [rax + 64] + + // NB stepping column-wise + vfmadd231ps zmm0, zmm10, zmm14 + vfmadd231ps zmm5, zmm10, zmm15 + + vmovaps zmm12, [rax + 128] + + vfmadd231ps zmm1, zmm11, zmm14 + vfmadd231ps zmm6, zmm11, zmm15 + + vmovaps zmm13, [rax + 192] + + vfmadd231ps zmm2, zmm12, zmm14 + vfmadd231ps zmm7, zmm12, zmm15 + + vmovaps zmm10, [rax + 256] + + vfmadd231ps zmm3, zmm13, zmm14 + vfmadd231ps zmm8, zmm13, zmm15 + + vmovaps zmm11, [rax + 320] + + vfmadd231ps zmm4, zmm10, zmm14 + vfmadd231ps zmm9, zmm10, zmm15 + + vbroadcastss zmm14, dword ptr [rcx + 8] + vbroadcastss zmm15, dword ptr [rcx + 12] + + vmovaps zmm12, [rax + 384] + + // NB stepping column-wise + vfmadd231ps zmm0, zmm11, zmm14 + vfmadd231ps zmm5, zmm11, zmm15 + + vmovaps zmm13, [rax + 448] + + vfmadd231ps zmm1, zmm12, zmm14 + vfmadd231ps zmm6, zmm12, zmm15 + + vmovaps zmm10, [rax + 512] + + vfmadd231ps zmm2, 
zmm13, zmm14 + vfmadd231ps zmm7, zmm13, zmm15 + + vmovaps zmm11, [rax + 576] + + vfmadd231ps zmm3, zmm10, zmm14 + vfmadd231ps zmm8, zmm10, zmm15 + + vfmadd231ps zmm4, zmm11, zmm14 + vfmadd231ps zmm9, zmm11, zmm15 + + add rax, 640 + add rcx, 16 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..73ef89b58 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,34 @@ + // Tile size: 5x2 + // Accumulators: 0-9 + // Col regs: zmm10-14 + // Row regs: zmm15-16 + + vmovaps zmm10, [rax] + vbroadcastss zmm15, dword ptr [rcx + 0] + vbroadcastss zmm16, dword ptr [rcx + 4] + vmovaps zmm11, [rax + 64] + + // NB stepping column-wise + vfmadd231ps zmm0, zmm10, zmm15 + vfmadd231ps zmm5, zmm10, zmm16 + + vmovaps zmm12, [rax + 128] + + vfmadd231ps zmm1, zmm11, zmm15 + vfmadd231ps zmm6, zmm11, zmm16 + + vmovaps zmm13, [rax + 192] + + vfmadd231ps zmm2, zmm12, zmm15 + vfmadd231ps zmm7, zmm12, zmm16 + + vmovaps zmm14, [rax + 256] + + vfmadd231ps zmm3, zmm13, zmm15 + vfmadd231ps zmm8, zmm13, zmm16 + + vfmadd231ps zmm4, zmm14, zmm15 + vfmadd231ps zmm9, zmm14, zmm16 + + add rax, 320 + add rcx, 8 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..8c7704433 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,25 @@ + // Tile size: 6x1 + // Accumulators: 0-5 + // Col regs: 6-11 + // Row regs: 15 + + + vbroadcastss zmm15, dword ptr [rcx] + vfmadd231ps zmm0, zmm15, [rax] + vfmadd231ps zmm1, zmm15, [rax + 64] + vfmadd231ps zmm2, zmm15, [rax + 128] + vfmadd231ps zmm3, zmm15, [rax + 192] + vfmadd231ps zmm4, zmm15, [rax + 256] + vfmadd231ps zmm5, zmm15, 
[rax + 320] + + vbroadcastss zmm14, dword ptr [rcx + 4] + + vfmadd231ps zmm0, zmm14, [rax + 384] + vfmadd231ps zmm1, zmm14, [rax + 448] + vfmadd231ps zmm2, zmm14, [rax + 512] + vfmadd231ps zmm3, zmm14, [rax + 576] + vfmadd231ps zmm4, zmm14, [rax + 640] + vfmadd231ps zmm5, zmm14, [rax + 704] + + add rax, 768 + add rcx, 8 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..a34c40fee --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,29 @@ + // Tile size: 6x1 + // Accumulators: 0-5 + // Col regs: 9-14 (one temporary per row of A) + // Row regs: 15 (broadcast B value, must stay live for all six rows) + + vbroadcastss zmm15, dword ptr [rcx] + + vmovups zmm10, [rax] + vmulps zmm10, zmm10, zmm15 + vaddps zmm0, zmm0, zmm10 + vmovups zmm11, [rax + 64] + vmulps zmm11, zmm11, zmm15 + vaddps zmm1, zmm1, zmm11 + vmovups zmm12, [rax + 128] + vmulps zmm12, zmm12, zmm15 + vaddps zmm2, zmm2, zmm12 + vmovups zmm13, [rax + 192] + vmulps zmm13, zmm13, zmm15 + vaddps zmm3, zmm3, zmm13 + vmovups zmm14, [rax + 256] + vmulps zmm14, zmm14, zmm15 + vaddps zmm4, zmm4, zmm14 + vmovups zmm9, [rax + 320] // zmm9, not zmm15: loading A into zmm15 would overwrite the broadcast B value + vmulps zmm9, zmm9, zmm15 + vaddps zmm5, zmm5, zmm9 + + + add rcx, 4 + add rax, 384 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..58ed8f433 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,70 @@ + // Tile size: 6x2 + // Accumulators: 0-11 + // Col regs: zmm12-13 + // Row regs: zmm14-15 + + vmovaps zmm12, [rax] + vbroadcastss zmm14, dword ptr [rcx + 0] + vbroadcastss zmm15, dword ptr [rcx + 4] + vmovaps zmm13, [rax + 64] + + vfmadd231ps zmm0, zmm12, zmm14 + vfmadd231ps zmm6, zmm12, zmm15 + + vmovaps zmm12, [rax +
128] + + vfmadd231ps zmm1, zmm13, zmm14 + vfmadd231ps zmm7, zmm13, zmm15 + + vmovaps zmm13, [rax + 192] + + vfmadd231ps zmm2, zmm12, zmm14 + vfmadd231ps zmm8, zmm12, zmm15 + + vmovaps zmm12, [rax + 256] + + vfmadd231ps zmm3, zmm13, zmm14 + vfmadd231ps zmm9, zmm13, zmm15 + + vmovaps zmm13, [rax + 320] + + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm10, zmm12, zmm15 + + vmovaps zmm12, [rax + 384] + + vfmadd231ps zmm5, zmm13, zmm14 + vfmadd231ps zmm11, zmm13, zmm15 + + vbroadcastss zmm14, dword ptr [rcx + 8] // step k+1 B values: reload only after row 5 of step k has used zmm14/zmm15 + vbroadcastss zmm15, dword ptr [rcx + 12] + vmovaps zmm13, [rax + 448] + + vfmadd231ps zmm0, zmm12, zmm14 + vfmadd231ps zmm6, zmm12, zmm15 + + vmovaps zmm12, [rax + 512] + + vfmadd231ps zmm1, zmm13, zmm14 + vfmadd231ps zmm7, zmm13, zmm15 + + vmovaps zmm13, [rax + 576] + + vfmadd231ps zmm2, zmm12, zmm14 + vfmadd231ps zmm8, zmm12, zmm15 + + vmovaps zmm12, [rax + 640] + + vfmadd231ps zmm3, zmm13, zmm14 + vfmadd231ps zmm9, zmm13, zmm15 + + vmovaps zmm13, [rax + 704] + + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm10, zmm12, zmm15 + + vfmadd231ps zmm5, zmm13, zmm14 + vfmadd231ps zmm11, zmm13, zmm15 + + add rax, 768 + add rcx, 16 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..0fa5fa8e4 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,38 @@ + // Tile size: 6x2 + // Accumulators: 0-11 + // Col regs: 12-13 + // Row regs: 14-15 + + vmovaps zmm12, [rax] + vbroadcastss zmm14, dword ptr [rcx + 0] + vbroadcastss zmm15, dword ptr [rcx + 4] + vmovaps zmm13, [rax + 64] + + vfmadd231ps zmm0, zmm12, zmm14 + vfmadd231ps zmm6, zmm12, zmm15 + + vmovaps zmm12, [rax + 128] + + vfmadd231ps zmm1, zmm13, zmm14 + vfmadd231ps zmm7, zmm13, zmm15 + + vmovaps zmm13, [rax + 192] + + vfmadd231ps zmm2, zmm12, zmm14 + vfmadd231ps zmm8, zmm12, zmm15 + + vmovaps zmm12,
[rax + 256] + + vfmadd231ps zmm3, zmm13, zmm14 + vfmadd231ps zmm9, zmm13, zmm15 + + vmovaps zmm13, [rax + 320] + + vfmadd231ps zmm4, zmm12, zmm14 + vfmadd231ps zmm10, zmm12, zmm15 + + vfmadd231ps zmm5, zmm13, zmm14 + vfmadd231ps zmm11, zmm13, zmm15 + + add rcx, 8 + add rax, 384 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..e23d79d2d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,40 @@ + // Tile size: 7x1, two k-steps per pass. NOTE(review): rax/rcx are not advanced here - presumably done by the including kernel; confirm + // Accumulators: 0-6 + // Col regs: 7-13 + // Row regs: 15 (first k-step), 16 (second k-step) + + vbroadcastss zmm15, dword ptr [rcx] + + vmovaps zmm7, [rax + 0] + vmovaps zmm8, [rax + 64] + vmovaps zmm9, [rax + 128] + vmovaps zmm10, [rax + 192] + vmovaps zmm11, [rax + 256] + vmovaps zmm12, [rax + 320] + vmovaps zmm13, [rax + 384] + + vfmadd231ps zmm0, zmm7, zmm15 + vfmadd231ps zmm1, zmm8, zmm15 + vfmadd231ps zmm2, zmm9, zmm15 + vfmadd231ps zmm3, zmm10, zmm15 + vfmadd231ps zmm4, zmm11, zmm15 + vfmadd231ps zmm5, zmm12, zmm15 + vfmadd231ps zmm6, zmm13, zmm15 + + vbroadcastss zmm16, dword ptr [rcx + 4] + + vmovaps zmm7, [rax + 448 + 0] + vmovaps zmm8, [rax + 448 + 64] + vmovaps zmm9, [rax + 448 + 128] + vmovaps zmm10, [rax + 448 + 192] + vmovaps zmm11, [rax + 448 + 256] + vmovaps zmm12, [rax + 448 + 320] + vmovaps zmm13, [rax + 448 + 384] + + vfmadd231ps zmm0, zmm7, zmm16 + vfmadd231ps zmm1, zmm8, zmm16 + vfmadd231ps zmm2, zmm9, zmm16 + vfmadd231ps zmm3, zmm10, zmm16 + vfmadd231ps zmm4, zmm11, zmm16 + vfmadd231ps zmm5, zmm12, zmm16 + vfmadd231ps zmm6, zmm13, zmm16 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..889cb34e9 --- /dev/null +++
b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,21 @@ + // Tile size: 7x1 + // Accumulators: 0-6 + // Col regs: 6-13 + // Row regs: 15 + vbroadcastss zmm15, dword ptr [rcx] + + vmovaps zmm7, [rax + 0] + vmovaps zmm8, [rax + 64] + vmovaps zmm9, [rax + 128] + vmovaps zmm10, [rax + 192] + vmovaps zmm11, [rax + 256] + vmovaps zmm12, [rax + 320] + vmovaps zmm13, [rax + 384] + + vfmadd231ps zmm0, zmm7, zmm15 + vfmadd231ps zmm1, zmm8, zmm15 + vfmadd231ps zmm2, zmm9, zmm15 + vfmadd231ps zmm3, zmm10, zmm15 + vfmadd231ps zmm4, zmm11, zmm15 + vfmadd231ps zmm5, zmm12, zmm15 + vfmadd231ps zmm6, zmm13, zmm15 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..96d0d9863 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,30 @@ + // Tile size: 8x1 + // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + vbroadcastss zmm17, dword ptr [rcx] + + + vfmadd231ps zmm0, zmm17, [rax + 0] + vfmadd231ps zmm1, zmm17, [rax + 64] + vfmadd231ps zmm2, zmm17, [rax + 128] + vfmadd231ps zmm3, zmm17, [rax + 192] + vfmadd231ps zmm4, zmm17, [rax + 256] + vfmadd231ps zmm5, zmm17, [rax + 320] + vfmadd231ps zmm6, zmm17, [rax + 384] + vfmadd231ps zmm7, zmm17, [rax + 448] + + vbroadcastss zmm16, dword ptr [rcx + 4] + + vfmadd231ps zmm0, zmm16, [rax + 0 + 512] + vfmadd231ps zmm1, zmm16, [rax + 64 + 512] + vfmadd231ps zmm2, zmm16, [rax + 128 + 512] + vfmadd231ps zmm3, zmm16, [rax + 192 + 512] + vfmadd231ps zmm4, zmm16, [rax + 256 + 512] + vfmadd231ps zmm5, zmm16, [rax + 320 + 512] + vfmadd231ps zmm6, zmm16, [rax + 384 + 512] + vfmadd231ps zmm7, zmm16, [rax + 448 + 512] + + add rcx, 8 + add rax, 1024 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli 
b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..38d57ce66 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,25 @@ + // Tile size: 8x1 + // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + vbroadcastss zmm15, dword ptr [rcx] + + vmovaps zmm8, [rax + 0] + vfmadd231ps zmm0, zmm15, zmm8 + vmovaps zmm9, [rax + 64] + vfmadd231ps zmm1, zmm15, zmm9 + vmovaps zmm10, [rax + 128] + vfmadd231ps zmm2, zmm15, zmm10 + vmovaps zmm11, [rax + 192] + vfmadd231ps zmm3, zmm15, zmm11 + vmovaps zmm12, [rax + 256] + vfmadd231ps zmm4, zmm15, zmm12 + vmovaps zmm13, [rax + 320] + vfmadd231ps zmm5, zmm15, zmm13 + vmovaps zmm14, [rax + 384] + vfmadd231ps zmm6, zmm15, zmm14 + vmovaps zmm8, [rax + 448] + vfmadd231ps zmm7, zmm15, zmm8 + add rcx, 4 + add rax, 512 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..772651ce8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,42 @@ + // Tile size: 8x2 + // Accumulators: 0-15 + // Col regs: 16-23 + // Row regs: 24-25 + + vmovaps zmm16, [rax + 0] + vbroadcastss zmm24, dword ptr [rcx + 0] + vbroadcastss zmm25, dword ptr [rcx + 4] + + vfmadd231ps zmm0, zmm16, zmm24 + vfmadd231ps zmm8, zmm16, zmm25 + + vmovaps zmm17, [rax + 64] + vfmadd231ps zmm1, zmm17, zmm24 + vfmadd231ps zmm9, zmm17, zmm25 + + vmovaps zmm18, [rax + 128] + vfmadd231ps zmm2, zmm18, zmm24 + vfmadd231ps zmm10, zmm18, zmm25 + + vmovaps zmm19, [rax + 192] + vfmadd231ps zmm3, zmm19, zmm24 + vfmadd231ps zmm11, zmm19, zmm25 + + vmovaps zmm20, [rax + 256] + vfmadd231ps zmm4, zmm20, zmm24 + vfmadd231ps zmm12, zmm20, zmm25 + + vmovaps zmm21, [rax + 320] + vfmadd231ps zmm5, zmm21, zmm24 + vfmadd231ps zmm13, zmm21, zmm25 + + vmovaps zmm22, [rax 
+ 384] + vfmadd231ps zmm6, zmm22, zmm24 + vfmadd231ps zmm14, zmm22, zmm25 + + vmovaps zmm23, [rax + 448] + vfmadd231ps zmm7, zmm23, zmm24 + vfmadd231ps zmm15, zmm23, zmm25 + + add rax, 512 + add rcx, 8 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli new file mode 100644 index 000000000..1400fdf0d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli @@ -0,0 +1,61 @@ + // Tile size: 1x8 + // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + + vmovaps zmm15, [rax] + + vbroadcastss zmm8, dword ptr [rcx + 0 * 4] + vfmadd231ps zmm0, zmm15, zmm8 + + vbroadcastss zmm9, dword ptr [rcx + 1 * 4] + vfmadd231ps zmm1, zmm15, zmm9 + + vbroadcastss zmm10, dword ptr [rcx + 2 * 4] + vfmadd231ps zmm2, zmm15, zmm10 + + vbroadcastss zmm11, dword ptr [rcx + 3 * 4] + vfmadd231ps zmm3, zmm15, zmm11 + + vbroadcastss zmm12, dword ptr [rcx + 4 * 4] + vfmadd231ps zmm4, zmm15, zmm12 + + vbroadcastss zmm13, dword ptr [rcx + 5 * 4] + vfmadd231ps zmm5, zmm15, zmm13 + + vbroadcastss zmm10, dword ptr [rcx + 6 * 4] + vfmadd231ps zmm6, zmm15, zmm10 + + vbroadcastss zmm11, dword ptr [rcx + 7 * 4] + vfmadd231ps zmm7, zmm15, zmm11 + + + vmovaps zmm15, [rax+64] + + vbroadcastss zmm8, dword ptr [rcx + 8 * 4] + vfmadd231ps zmm0, zmm15, zmm8 + + vbroadcastss zmm9, dword ptr [rcx + 9 * 4] + vfmadd231ps zmm1, zmm15, zmm9 + + vbroadcastss zmm10, dword ptr [rcx + 10 * 4] + vfmadd231ps zmm2, zmm15, zmm10 + + vbroadcastss zmm11, dword ptr [rcx + 11 * 4] + vfmadd231ps zmm3, zmm15, zmm11 + + vbroadcastss zmm12, dword ptr [rcx + 12 * 4] + vfmadd231ps zmm4, zmm15, zmm12 + + vbroadcastss zmm13, dword ptr [rcx + 13 * 4] + vfmadd231ps zmm5, zmm15, zmm13 + + vbroadcastss zmm10, dword ptr [rcx + 14 * 4] + vfmadd231ps zmm6, zmm15, zmm10 + + vbroadcastss zmm11, dword ptr [rcx + 15 * 4] + vfmadd231ps zmm7, zmm15, zmm11 + + add 
rcx, 64 + add rax, 128 \ No newline at end of file diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 000000000..c08151c2a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,33 @@ + // Tile size: 1x8 + // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + vmovaps zmm15, [rax] + + vbroadcastss zmm8, dword ptr [rcx + 0 * 4] + vfmadd231ps zmm0, zmm15, zmm8 + + vbroadcastss zmm9, dword ptr [rcx + 1 * 4] + vfmadd231ps zmm1, zmm15, zmm9 + + vbroadcastss zmm10, dword ptr [rcx + 2 * 4] + vfmadd231ps zmm2, zmm15, zmm10 + + vbroadcastss zmm11, dword ptr [rcx + 3 * 4] + vfmadd231ps zmm3, zmm15, zmm11 + + vbroadcastss zmm12, dword ptr [rcx + 4 * 4] + vfmadd231ps zmm4, zmm15, zmm12 + + vbroadcastss zmm13, dword ptr [rcx + 5 * 4] + vfmadd231ps zmm5, zmm15, zmm13 + + vbroadcastss zmm10, dword ptr [rcx + 6 * 4] + vfmadd231ps zmm6, zmm15, zmm10 + + vbroadcastss zmm11, dword ptr [rcx + 7 * 4] + vfmadd231ps zmm7, zmm15, zmm11 + + add rcx, 32 + add rax, 64 diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl new file mode 100644 index 000000000..382ae2ca6 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl @@ -0,0 +1,110 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 128 x 1 + + zmm0 + zmm1 + ... 
+ zmm7 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" size:"128x1", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{align}} 16 +{{L}}main_loop_packed_packed: + {% include "8x1/packed_packed_loop1/avx-512.tmpli" %} + + sub rbx, 1 + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:7 %} +{% include "f32_per_rows.tmpliq" mr:128, from:0, to:7 %} +{% include "f32_per_cols.tmpliq" mr:128, from:0, to:7 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:7 %} + +{{L}}add_unicast: + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + {% for row in (0..7) %} + vaddps zmm{{row}}, zmm{{row}}, [ r10 + {{row|times:64}} ] + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss zmm14, dword ptr [rbx] + +{% for i in (0..7) %} + vmovups zmm12, [rax + {{i|times:64}}] + vfmadd231ps zmm{{i}}, zmm12, zmm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + cmp rsi, 4 + jne {{L}}store_noncontiguous + + test r8, 63 + jnz {{L}}store_unaligned + + {% for row in (0..7) %} + vmovaps [r8 + {{row|times:64}}], zmm{{row}} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_unaligned: + {% for row in (0..7) %} + vmovups [r8 + {{row|times:64}}], zmm{{row}} + {% endfor %} + + 
jmp {{L}}non_linear_loop + +{{L}}store_noncontiguous: + {% for r in (0..7) %} + {% for quarter in (0..3) %} + vextractf32x4 xmm8, zmm{{r}}, {{quarter}} + {% for row in (0..3) %} + vextractps dword ptr [r8], xmm8, {{row}} + add r8, rsi + {% endfor %} + {% endfor %} + {% endfor %} + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"128x1", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl new file mode 100644 index 000000000..5f2f57a07 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl @@ -0,0 +1,143 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 16 x 1 + + zmm0 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + + +{% include "preamble.tmpliq" size:"16x1", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + + cmp rbx, 8 + jl {{L}}main_loop_packed_packed_tail + +{{align}} 16 +{{L}}main_loop_packed_packed: + {% include "1x1/packed_packed_loop1/unroll-4.tmpli" %} + + sub rbx, 4 + cmp rbx, 4 + jge {{L}}main_loop_packed_packed + + {% for r in (1..3) %} + vaddps zmm0, zmm0, zmm{{r}} + {% endfor %} + + test rbx, rbx + jz {{L}}non_linear_loop + +{{align}} 16 +{{L}}main_loop_packed_packed_tail: + {% include "1x1/packed_packed_loop1/avx-512.tmpli" %} + + sub rbx, 1 + jnz {{L}}main_loop_packed_packed_tail + + jmp {{L}}non_linear_loop + 
+{% include "f32_scalars.tmpliq" from:0, to:0 %} +{% include "f32_per_rows.tmpliq" mr:16, from:0, to:0 %} +{% include "f32_per_cols.tmpliq" mr:16, from:0, to:0 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:0 %} + +{{L}}add_unicast: + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + cmp rsi, 4 + jne {{L}}add_unicast_generic + + vaddps zmm0, zmm0, [r10] + + jmp {{L}}non_linear_loop + +{{L}}add_unicast_generic: + mov r8, [0] +// mov eax, 0 +// {% for i in (0..3) %} +// pinsrd xmm14, eax, {{i}} +// add eax, esi +// {% endfor %} +// {% for i in (0..3) %} +// pinsrd xmm15, eax, {{i}} +// add eax, esi +// {% endfor %} +// +// vperm2f128 zmm14, zmm14, zmm15, 32 // zmm14 <- xmm14::xmm15 +// +// {% for i in (0..7) %} +// vpcmpeqd zmm15, zmm15, zmm15 +// vgatherdps zmm12, [ r10 + zmm14 ], zmm15 +// +// vaddps zmm{{i}}, zmm{{i}}, zmm12 +// lea r10, [ r10 + rsi * 8 ] +// {% endfor %} +// + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss zmm14, dword ptr [rbx] + +{% for i in (0..0) %} + vmovups zmm12, [rax + {{i|times:64}}] + vfmadd231ps zmm{{i}}, zmm12, zmm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + cmp rsi, 4 + jne {{L}}store_noncontiguous + + test r8, 63 + jnz {{L}}store_unaligned + + vmovaps [r8], zmm0 + jmp {{L}}non_linear_loop + +{{L}}store_unaligned: + vmovups [r8], zmm0 + jmp {{L}}non_linear_loop + +{{L}}store_noncontiguous: + {% for quarter in (0..3) %} + vextractf32x4 xmm8, zmm0, {{quarter}} + {% for row in (0..3) %} + vextractps dword ptr [r8], xmm8, {{row}} + add r8, rsi + {% endfor %} + {% endfor %} + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"16x1", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl new file mode 100644 index 
000000000..634454a8b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl @@ -0,0 +1,165 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 16 x 12 + + zmm0 zmm1 ... zmm11 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + + +{% include "preamble.tmpliq" size:"16x12", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{align}} 16 +{{L}}main_loop_packed_packed_tail: + {% include "1x12/packed_packed_loop1/avx-512.tmpli" %} + + sub rbx, 1 + jnz {{L}}main_loop_packed_packed_tail + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:11 %} +{% include "f32_per_rows.tmpliq" mr:16, from:0, to:11 %} +{% include "f32_per_cols.tmpliq" mr:16, from:0, to:11 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 // running row offset, bumped by the row stride after each lane insert + +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 // zmm14 <- 16 row offsets (lane i = i * row stride), used as the gather index + +{% for i in (0..11) %} + kxnorw
k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i}}, zmm{{i}}, zmm12 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups zmm12, zmmword ptr [rax] + +{% for i in (0..11) %} + vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps zmm{{i}}, zmm12, zmm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for quarter in (0..3) %} + {% for r in (0..3) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..3) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + mov r8, [rdi + 8] // c ptr + + // tops of cols + lea r8, [ r8 + 4 * rbx ] + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for quarter in (0..3) %} + {% for r in (0..3) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 4}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..3) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + mov r8, [rdi + 8] // c ptr + + // tops of cols + lea r8, [ r8 + 8 * rbx ] + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for quarter in (0..3) %} + {% for r in (0..3) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 8}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..3) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" 
size:"16x12", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl new file mode 100644 index 000000000..69761aaab --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl @@ -0,0 +1,143 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 16 x 8 + + zmm0 zmm1 ... zmm7 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + + +{% include "preamble.tmpliq" size:"16x8", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + + cmp rbx, 2 + jl {{L}}main_loop_packed_packed_tail + +{{align}} 16 +{{L}}main_loop_packed_packed: + {% include "8x8/packed_packed_loop1/avx-512-unroll.tmpli" %} + + sub rbx, 2 + cmp rbx, 2 + jge {{L}}main_loop_packed_packed + + test rbx, rbx + jz {{L}}non_linear_loop + +{{align}} 16 +{{L}}main_loop_packed_packed_tail: + {% include "8x8/packed_packed_loop1/avx-512.tmpli" %} + + sub rbx, 1 + jnz {{L}}main_loop_packed_packed_tail + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:7 %} +{% include "f32_per_rows.tmpliq" mr:16, from:0, to:7 %} +{% include "f32_per_cols.tmpliq" mr:16, from:0, to:7 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:7 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 + +{% for i in (0..3) %} +
pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 + +{% for i in (0..7) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i}}, zmm{{i}}, zmm12 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups zmm12, zmmword ptr [rax] + +{% for i in (0..7) %} + vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps zmm{{i}}, zmm12, zmm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r12, [ r8 + 4 * rbx ] + lea r11, [ r10 + rbx ] + lea r13, [ r12 + rbx ] + lea r14, [ r12 + 2 * rbx ] + lea r15, [ r13 + 2 * rbx ] + + {% for quarter in (0..3) %} + {% for r in (0..7) %} + vextractf32x4 xmm{{r | plus: 8}}, zmm{{r}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..7) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 8}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"16x8", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl new file mode 100644 index 000000000..be4ac53fb --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl @@ -0,0 +1,144 @@ +{% comment %} +// vim: set
syntax=asm : + +/* mmm 32 x 5: + + zmm0 zmm2 zmm4 zmm6 zmm8 + zmm1 zmm3 zmm5 zmm7 zmm9 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" size:"32x5", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "2x5/packed_packed_loop1/avx-512.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:9 %} +{% include "f32_per_rows.tmpliq" mr:32, from:0, to:9 %} +{% include "f32_per_cols.tmpliq" mr:32, from:0, to:9 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:9 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 + +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 + +{% for i in (0..4) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 2}}, zmm{{i | times: 2}}, zmm12 +{% endfor %} + + imul esi, 16 +
vpbroadcastd zmm15, esi + + mov r10, [rdi + 8] + vpaddd zmm14, zmm14, zmm15 + +{% for i in (0..4) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 2 | plus: 1}}, zmm{{i | times: 2 | plus: 1}}, zmm12 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups zmm12, zmmword ptr [rax] + vmovups zmm13, zmmword ptr [rax+64] + +{% for i in (0..4) %} + vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps zmm{{i | times: 2}}, zmm12, zmm14 + vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + lea r12, [ r10 + 2 * rbx ] + + {% for word in (0..1) %} + {% for quarter in (0..3) %} + {% for r in (0..4) %} + vextractf32x4 xmm{{r | plus: 11}}, zmm{{r | times: 2 | plus: word}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..4) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 11}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"32x5", suffix:suffix, G:G, L:L, arch:"avx512" %} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl new file mode 100644 index 000000000..acca978da --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl @@ -0,0 +1,161 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 32 x 6: + + zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 + zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, 
r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" size:"32x6", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "2x6/packed_packed_loop1/avx-512.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:11 %} +{% include "f32_per_rows.tmpliq" mr:32, from:0, to:11 %} +{% include "f32_per_cols.tmpliq" mr:32, from:0, to:11 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 + +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 + +{% for i in (0..5) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 2}}, zmm{{i | times: 2}}, zmm12 +{% endfor %} + + mov r10, [rdi + 8] + imul esi, 16 + vpbroadcastd zmm15, esi + vpaddd zmm14, zmm14, zmm15 + +{% for i in (0..5) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 2 | plus: 1}}, zmm{{i | times: 2 | plus:
1}}, zmm12 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups zmm12, zmmword ptr [rax] + vmovups zmm13, zmmword ptr [rax+64] + +{% for i in (0..5) %} + vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps zmm{{i | times: 2}}, zmm12, zmm14 + vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for word in (0..1) %} + {% for quarter in (0..3) %} + {% for r in (0..2) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 2 | plus: word}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..2) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + {% endfor %} + + // tops of cols + mov r8, r11 + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + + {% for word in (0..1) %} + {% for quarter in (0..3) %} + {% for r in (0..2) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 3 | times: 2 | plus: word}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..2) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"32x6", suffix:suffix, G:G, L:L, arch:"avx512" %} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl new file mode 100644 index 000000000..6f7e8b456 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl @@ -0,0 +1,148 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 48 x 
4: + + zmm0 zmm3 zmm6 zmm9 + zmm1 zmm4 zmm7 zmm10 + zmm2 zmm5 zmm8 zmm11 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" size:"48x4", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "3x4/packed_packed_loop1/avx-512.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:11 %} +{% include "f32_per_rows.tmpliq" mr:48, from:0, to:11 %} +{% include "f32_per_cols.tmpliq" mr:48, from:0, to:11 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 + +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 + +{% for i in (0..3) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 3}}, zmm{{i | times: 3}}, zmm12 +{% endfor %} + + imul esi, 16 + vpbroadcastd zmm15,
esi + +{% for j in (1..2) %} + mov r10, [rdi + 8] + vpaddd zmm14, zmm14, zmm15 + + {% for i in (0..3) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 3 | plus: j}}, zmm{{i | times: 3 | plus: j}}, zmm12 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups zmm12, zmmword ptr [rax] + vmovups zmm13, zmmword ptr [rax+64] + vmovups zmm15, zmmword ptr [rax+128] + +{% for i in (0..3) %} + vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps zmm{{i | times: 3}}, zmm12, zmm14 + vfmadd231ps zmm{{i | times: 3 | plus: 1}}, zmm13, zmm14 + vfmadd231ps zmm{{i | times: 3 | plus: 2}}, zmm15, zmm14 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for word in (0..2) %} + {% for quarter in (0..3) %} + {% for r in (0..3) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 3 | plus: word}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..3) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"48x4", suffix:suffix, G:G, L:L, arch:"avx512" %} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl new file mode 100644 index 000000000..625f22aaa --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl @@ -0,0 +1,149 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 64 x 3: + + zmm0 zmm4 zmm8 + zmm1 zmm5 zmm9 + zmm2 zmm6 zmm10 + zmm3 zmm7 zmm11 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + 
preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" size:"64x3", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "4x3/packed_packed_loop1/avx-512.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:11 %} +{% include "f32_per_rows.tmpliq" mr:64, from:0, to:11 %} +{% include "f32_per_cols.tmpliq" mr:64, from:0, to:11 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 + +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 + +{% for i in (0..2) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 4}}, zmm{{i | times: 4}}, zmm12 +{% endfor %} + + imul esi, 16 + vpbroadcastd zmm15, esi + +{% for j in (1..3) %} + mov r10, [rdi + 8] + vpaddd zmm14, zmm14, zmm15 + + {% for i in (0..2) %} + kxnorw k1,k1,k1 +
vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 4 | plus: j}}, zmm{{i | times: 4 | plus: j}}, zmm12 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss zmm13, dword ptr [rbx] + vbroadcastss zmm14, dword ptr [rbx+4] + vbroadcastss zmm15, dword ptr [rbx+8] + +{% for i in (0..3) %} + vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] + vfmadd231ps zmm{{i}}, zmm12, zmm13 + vfmadd231ps zmm{{i | plus: 4}}, zmm12, zmm14 + vfmadd231ps zmm{{i | plus: 8}}, zmm12, zmm15 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for word in (0..3) %} + {% for quarter in (0..3) %} + {% for r in (0..2) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 4 | plus: word}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..2) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"64x3", suffix:suffix, G:G, L:L, arch:"avx512" %} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl new file mode 100644 index 000000000..7350b784e --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl @@ -0,0 +1,148 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 80 x 2: + + zmm0 zmm5 + zmm1 zmm6 + zmm2 zmm7 + zmm3 zmm8 + zmm4 zmm9 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + 
preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" size:"80x2", suffix:suffix, G:G, arch:"avx512" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "5x2/packed_packed_loop1/avx-512.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "f32_scalars.tmpliq" from:0, to:9 %} +{% include "f32_per_rows.tmpliq" mr:80, from:0, to:9 %} +{% include "f32_per_cols.tmpliq" mr:80, from:0, to:9 %} +{% include "avx512_mmm_load_tile.tmpliq" from:0, to:9 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 + +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm12, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm13, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13 + vinsertf32x8 zmm14, zmm14, ymm13, 1 + +{% for i in (0..1) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 5}}, zmm{{i | times: 5}}, zmm12 +{% endfor %} + + imul esi, 16 + vpbroadcastd zmm15, esi + +{% for j in (1..4) %} + mov r10, [rdi + 8] + vpaddd zmm14, zmm14, zmm15 + + {% for i in (0..1) %} + kxnorw k1,k1,k1 + vgatherdps zmm12{k1}, [ r10 + zmm14 ] + add r10, rbx + vaddps zmm{{i | times: 5 | plus: j}}, zmm{{i | times: 5 | plus: j}}, zmm12 + {% endfor %} +{% endfor %} + + jmp
{{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss zmm14, dword ptr [rbx] + vbroadcastss zmm15, dword ptr [rbx+4] + +{% for i in (0..4) %} + vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] + vfmadd231ps zmm{{i}}, zmm12, zmm14 + vfmadd231ps zmm{{i | plus: 5}}, zmm12, zmm15 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + {% for word in (0..4) %} + {% for quarter in (0..3) %} + {% for r in (0..1) %} + vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 5 | plus: word}}, {{quarter}} + {% endfor %} + {% for row in (0..3) %} + {% for i in (0..1) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" size:"80x2", suffix:suffix, G:G, L:L, arch:"avx512" %} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq new file mode 100644 index 000000000..91c89ee82 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq @@ -0,0 +1,9 @@ +// vim: set syntax=asm : + +{{L}}load_tile: + mov r8, [rdi + 8] + {% for reg in (from..to) %} + vmovups zmm{{reg}}, zmmword ptr [r8 + {{ reg|minus:from|times:64 }}] + {% endfor %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq new file mode 100644 index 000000000..1c63f72ad --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq @@ -0,0 +1,40 @@ +// vim: set syntax=asm : + +{{L}}non_linear: + +{{L}}non_linear_loop_enter: + sub rdi, 40 
+{{L}}non_linear_loop: + add rdi, 40 + mov rax, [rdi] + + mov r8, {{ jump_table | size }} + cmp rax, 0 + cmovl rax, r8 + cmp rax, {{ jump_table | size }} + cmovg rax, r8 + +{% if msvc %} + lea r8, [ offset {{L}}jmp_table ] +{% else %} + lea r8, [ rip + {{L}}jmp_table ] +{% endif %} + movsxd r9, dword ptr [ r8 + rax * 4 ] + lea r8, [ r8 + r9 ] + jmp r8 + +{{L}}jmp_table: +{% for j in jump_table %} + {{long}} {{L}}{{j}}-{{L}}jmp_table +{% endfor %} + {{long}} {{L}}unsupported-{{L}}jmp_table + +{{L}}unsupported: + mov rax, 1 + jmp {{L}}return + + +{{L}}done: + mov rax, 0 + jmp {{L}}return + + diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq new file mode 100644 index 000000000..6d4097d41 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq @@ -0,0 +1,8 @@ +// vim: set syntax=asm : + +{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, from:from, to:to %} +{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, from:from, to:to %} +{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, from:from, to:to %} +{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, from:from, to:to %} +{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", from:from, to:to %} +{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq new file mode 100644 index 000000000..b20fcbbbb --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq @@ -0,0 +1,8 @@ +// vim: set syntax=asm : + +{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, from:from, to:to %} +{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, from:from, to:to %} +{% include
"zmm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, from:from, to:to %} +{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, from:from, to:to %} +{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", from:from, to:to %} +{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq new file mode 100644 index 000000000..7876d6cba --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq @@ -0,0 +1,29 @@ +// vim: set syntax=asm : + +{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to %} +{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to %} +{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to %} +{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} + +{{L}}leaky_relu: + // can only use zmm12 to zmm15 + // zmm15 <- alpha + vbroadcastss zmm15, dword ptr [rdi + 8] + // zmm14 <- all zero + vpxorq zmm14, zmm14, zmm14 + + {% for reg in (from..to) %} + vcmpps k1, zmm{{reg}}, zmm14, 1 // 1 means LT + // x <- alpha * x in lanes where x < 0 (merge-masked by k1) + vmulps zmm{{reg}} {k1}, zmm{{reg}}, zmm15 + {% endfor %} + // lanes not selected by k1 keep their original value + + jmp {{L}}non_linear_loop + +{{L}}q_scale: +{{L}}q_shl: +{{L}}q_shr: + jmp {{L}}unsupported diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq new file mode 100644 index 000000000..789bf77c2 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq @@ -0,0 +1,8 @@ +// vim: set syntax=asm : + +{% include
"zmm_per_col.tmpliq" label:"per_col_min", op:"vpminsd", mr:mr, from:from, to:to%} +{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vpmaxsd", mr:mr, from:from, to:to%} +{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vpaddd", mr:mr, from:from, to:to%} +{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vpmulld", mr:mr, from:from, to:to%} +{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vpsubd", from:from, to:to%} +{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true%} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq new file mode 100644 index 000000000..5e21b01eb --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq @@ -0,0 +1,8 @@ +// vim: set syntax=asm : + +{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vpminsd", mr:mr, from:from, to:to%} +{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vpmaxsd", mr:mr, from:from, to:to%} +{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vpaddd", mr:mr, from:from, to:to%} +{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vpmulld", mr:mr, from:from, to:to%} +{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vpsubd", from:from, to:to%} +{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true%} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq new file mode 100644 index 000000000..0b36e4910 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq @@ -0,0 +1,10 @@ +// vim: set syntax=asm : +{% unless arch %} + {% assign arch = "ymm" %} +{% endunless %} +{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vpminsd", from:from, to:to, arch:arch %} +{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vpmaxsd", from:from, to:to, arch:arch 
%} +{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vpmulld", from:from, to:to, arch:arch %} +{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vpaddd", from:from, to:to, arch:arch %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vpsubd", from:from, to:to, arch:arch %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, arch:arch %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq new file mode 100644 index 000000000..ff3071a71 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq @@ -0,0 +1,38 @@ +{{L}}return: + ldmxcsr [rsp + 4] + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + +{% if msvc %} +{{arch}}_mmm_f32_{{size}}_{{suffix}} endp +_text ends +end + +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq new file mode 100644 index 000000000..3ed2f7c30 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq @@ -0,0 +1,63 @@ +{% if msvc %} + +_text segment +{{arch}}_mmm_f32_{{size}}_{{suffix}} proc + +{% else %} + +.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}{{arch}}_mmm_f32_{{size}}_{{suffix}} +{{G}}{{arch}}_mmm_f32_{{size}}_{{suffix}}: +.cfi_startproc + +{% endif %} + + push rbp + mov rbp, rsp + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// 
https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps [rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + push rdi + push rsi + + mov rdi, rcx + +{% endif %} + + push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +.cfi_def_cfa_offset 64 +{% endif %} + stmxcsr [rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] + +{% include "dispatcher.tmpliq" %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl new file mode 100644 index 000000000..5c962c6f2 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl @@ -0,0 +1,324 @@ +{% comment %} +// vim: set syntax=asm : + + +// TODO[TSolberg] : Not validated. 
+ +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) + +{% endcomment %} + +{% if msvc %} + +_text segment +avx512_sigmoid_f32_{{suffix}} proc + +{% else %} + +.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}avx512_sigmoid_f32_{{suffix}} +{{G}}avx512_sigmoid_f32_{{suffix}}: +.cfi_startproc +{% endif %} + + push rbp + mov rbp, rsp + + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps [rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + // move around arguments to mimic SysV rdi,rsi passing + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + +{% endif %} + + push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +// FIXME +// .cfi_def_cfa_offset 64 +{% endif %} + + stmxcsr [rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] +// ---------------------------------------------------------------------- + +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%} {%endcapture%} + + cmp rsi, 0 + je {{L}}done + + cmp rsi, 64 // loop_4 handles 4 zmm = 64 f32 per pass + jl {{L}}loop_1 + +{{L}}loop_4: + + vmovaps zmm4, [rdi] + vmovaps zmm5, [rdi + 64] + vmovaps zmm6, [rdi + 128] + vmovaps zmm7, [rdi + 192] + + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low] +
vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmaxps zmm4, zmm4, zmm0 + vmaxps zmm5, zmm5, zmm0 + vmaxps zmm6, zmm6, zmm0 + vmaxps zmm7, zmm7, zmm0 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + + vminps zmm4, zmm4, zmm1 + vminps zmm5, zmm5, zmm1 + vminps zmm6, zmm6, zmm1 + vminps zmm7, zmm7, zmm1 // zmm4..7 <- x + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + + vmulps zmm8, zmm4, zmm4 + vmulps zmm9, zmm5, zmm5 + vmulps zmm10, zmm6, zmm6 + vmulps zmm11, zmm7, zmm7 // zmm8..11 <- x^2 + + vmovaps zmm12, zmm2 + vmovaps zmm13, zmm2 + vmovaps zmm14, zmm2 + vmovaps zmm15, zmm2 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm13, zmm3, zmm9 + vfmadd132ps zmm14, zmm3, zmm10 + vfmadd132ps zmm15, zmm3, zmm11 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_10] + vfmadd132ps zmm12, zmm0, zmm8 + vfmadd132ps zmm13, zmm0, zmm9 + vfmadd132ps zmm14, zmm0, zmm10 + vfmadd132ps zmm15, zmm0, zmm11 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_8] + vfmadd132ps zmm12, zmm1, zmm8 + vfmadd132ps zmm13, zmm1, zmm9 + vfmadd132ps zmm14, zmm1, zmm10 + vfmadd132ps zmm15, zmm1, zmm11 + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps zmm12, zmm2, zmm8 + vfmadd132ps zmm13, zmm2, zmm9 + vfmadd132ps zmm14, zmm2, zmm10 + vfmadd132ps zmm15, zmm2, zmm11 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vmulps zmm4, zmm4, zmm12 + vmulps zmm5, zmm5, zmm13 + vmulps zmm6, zmm6, zmm14 + vmulps zmm7, zmm7, zmm15 // zmm4..7 <- num + + vmovaps zmm12, zmm3 + vmovaps zmm13, zmm3 + vmovaps zmm14, zmm3 + vmovaps zmm15, zmm3 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps zmm12, zmm0, zmm8 + vfmadd132ps zmm13, zmm0, zmm9 + vfmadd132ps
zmm14, zmm0, zmm10 + vfmadd132ps zmm15, zmm0, zmm11 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vfmadd132ps zmm12, zmm1, zmm8 + vfmadd132ps zmm13, zmm1, zmm9 + vfmadd132ps zmm14, zmm1, zmm10 + vfmadd132ps zmm15, zmm1, zmm11 + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_half] + vfmadd132ps zmm12, zmm2, zmm8 + vfmadd132ps zmm13, zmm2, zmm9 + vfmadd132ps zmm14, zmm2, zmm10 + vfmadd132ps zmm15, zmm2, zmm11 + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm13, zmm3, zmm9 + vfmadd132ps zmm14, zmm3, zmm10 + vfmadd132ps zmm15, zmm3, zmm11 + vfmadd132ps zmm12, zmm0, zmm8 + vfmadd132ps zmm13, zmm0, zmm9 + vfmadd132ps zmm14, zmm0, zmm10 + vfmadd132ps zmm15, zmm0, zmm11 // zmm12..15 <- denom + + vdivps zmm4, zmm4, zmm12 + vdivps zmm5, zmm5, zmm13 + vdivps zmm6, zmm6, zmm14 + vdivps zmm7, zmm7, zmm15 + vaddps zmm4, zmm4, zmm1 + vaddps zmm5, zmm5, zmm1 + vaddps zmm6, zmm6, zmm1 + vaddps zmm7, zmm7, zmm1 + + vmovaps [rdi], zmm4 + vmovaps [rdi + 64], zmm5 + vmovaps [rdi + 128], zmm6 + vmovaps [rdi + 192], zmm7 + + add rdi, 256 + sub rsi, 64 // 64 f32 (256 bytes) were just processed + cmp rsi, 64 + jg {{L}}loop_4 + + cmp rsi, 0 + je {{L}}done + +{{L}}loop_1: + vmovaps zmm4, [rdi] + + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmaxps zmm4, zmm4, zmm0 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + + vminps zmm4, zmm4, zmm1 // zmm4 <- x + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + + vmulps zmm8, zmm4, zmm4 // zmm8 <- x^2 + + vmovaps zmm12, zmm2 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps zmm12, zmm3, zmm8 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_10] + vfmadd132ps zmm12, zmm0, zmm8 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_8] + vfmadd132ps
zmm12, zmm1, zmm8 + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps zmm12, zmm2, zmm8 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vmulps zmm4, zmm4, zmm12 + + vmovaps zmm12, zmm3 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps zmm12, zmm0, zmm8 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vfmadd132ps zmm12, zmm1, zmm8 + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_half] + vfmadd132ps zmm12, zmm2, zmm8 + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm12, zmm0, zmm8 + + vdivps zmm4, zmm4, zmm12 + vaddps zmm4, zmm4, zmm1 + + vmovaps [rdi], zmm4 + add rdi, 64 // one zmm = 16 f32 = 64 bytes + sub rsi, 16 // NOTE(review): element count must be a multiple of 16 - confirm against caller + jnz {{L}}loop_1 + +{{L}}done: + +// ---------------------------------------------------------------------- + + ldmxcsr [rsp + 4] + + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + +{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%} + +{{L}}coeffs_num_low: + {{float}} -18.0 // low +{{L}}coeffs_num_high: + {{float}} 18.0 // high + +{{L}}coeffs_num_alpha_9: + {{float}} 4.37031012579801e-11 // alpha_9 +{{L}}coeffs_num_alpha_7: + {{float}} 1.15627324459942e-07 // alpha_7 +{{L}}coeffs_num_alpha_5: + {{float}} 6.08574864600143e-05 // alpha_5 +{{L}}coeffs_num_alpha_3: + {{float}} 8.51377133304701e-03 // alpha_3 +{{L}}coeffs_num_alpha_1: + {{float}} 2.48287947061529e-01 // alpha_1 + +{{L}}coeffs_num_beta_10: + {{float}} 6.10247389755681e-13 +{{L}}coeffs_num_beta_8: + {{float}} 5.76102136993427e-09 +{{L}}coeffs_num_beta_6: + {{float}} 6.29106785017040e-06 // beta_6
+{{L}}coeffs_num_beta_4: + {{float}} 1.70198817374094e-03 // beta_4 +{{L}}coeffs_num_beta_2: + {{float}} 1.16817656904453e-01 // beta_2 +{{L}}coeffs_num_beta_0: + {{float}} 9.93151921023180e-01 // beta_0 + +{{L}}coeffs_num_half: + {{float}} 0.5 + +{% if msvc %} +avx512_sigmoid_f32_{{suffix}} endp +_text ends +end +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl new file mode 100644 index 000000000..dc4b0f07a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl @@ -0,0 +1,313 @@ +{% comment %} +// vim: set syntax=asm : + +// TODO[TSolberg] : Not validated. + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) + +{% endcomment %} + +{% if msvc %} + +_text segment +avx512_tanh_f32_{{suffix}} proc + +{% else %} + +.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}avx512_tanh_f32_{{suffix}} +{{G}}avx512_tanh_f32_{{suffix}}: +.cfi_startproc +{% endif %} + + push rbp + mov rbp, rsp + + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps [rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + // move around arguments to mimic SysV rdi,rsi passing + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + +{% endif %} + +
push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +// FIXME +// .cfi_def_cfa_offset 64 +{% endif %} + + stmxcsr [rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] +// ---------------------------------------------------------------------- + +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%} {%endcapture%} + + cmp rsi, 0 + je {{L}}done + + cmp rsi, 64 // loop_4 handles 4 zmm = 64 f32 per pass + jl {{L}}loop_1 + +{{L}}loop_4: + + vmovaps zmm4, [rdi] + vmovaps zmm5, [rdi + 64] + vmovaps zmm6, [rdi + 128] + vmovaps zmm7, [rdi + 192] + + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13] + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11] + + vmaxps zmm4, zmm4, zmm0 + vmaxps zmm5, zmm5, zmm0 + vmaxps zmm6, zmm6, zmm0 + vmaxps zmm7, zmm7, zmm0 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + + vminps zmm4, zmm4, zmm1 + vminps zmm5, zmm5, zmm1 + vminps zmm6, zmm6, zmm1 + vminps zmm7, zmm7, zmm1 // zmm4..7 <- x + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmulps zmm8, zmm4, zmm4 + vmulps zmm9, zmm5, zmm5 + vmulps zmm10, zmm6, zmm6 + vmulps zmm11, zmm7, zmm7 // zmm8..11 <- x^2 + + vmovaps zmm12, zmm2 + vmovaps zmm13, zmm2 + vmovaps zmm14, zmm2 + vmovaps zmm15, zmm2 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm13, zmm3, zmm9 + vfmadd132ps zmm14, zmm3, zmm10 + vfmadd132ps zmm15, zmm3, zmm11 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + vfmadd132ps zmm12, zmm0, zmm8 + vfmadd132ps zmm13, zmm0, zmm9 + vfmadd132ps zmm14, zmm0, zmm10 + vfmadd132ps zmm15, zmm0, zmm11 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps zmm12, zmm1, zmm8 + vfmadd132ps zmm13, zmm1, zmm9 +
vfmadd132ps zmm14, zmm1, zmm10 + vfmadd132ps zmm15, zmm1, zmm11 + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps zmm12, zmm2, zmm8 + vfmadd132ps zmm13, zmm2, zmm9 + vfmadd132ps zmm14, zmm2, zmm10 + vfmadd132ps zmm15, zmm2, zmm11 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm13, zmm3, zmm9 + vfmadd132ps zmm14, zmm3, zmm10 + vfmadd132ps zmm15, zmm3, zmm11 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps zmm12, zmm0, zmm8 + vfmadd132ps zmm13, zmm0, zmm9 + vfmadd132ps zmm14, zmm0, zmm10 + vfmadd132ps zmm15, zmm0, zmm11 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vmulps zmm4, zmm4, zmm12 + vmulps zmm5, zmm5, zmm13 + vmulps zmm6, zmm6, zmm14 + vmulps zmm7, zmm7, zmm15 // zmm4..7 <- num + + vmovaps zmm12, zmm1 + vmovaps zmm13, zmm1 + vmovaps zmm14, zmm1 + vmovaps zmm15, zmm1 + vfmadd132ps zmm12, zmm2, zmm8 + vfmadd132ps zmm13, zmm2, zmm9 + vfmadd132ps zmm14, zmm2, zmm10 + vfmadd132ps zmm15, zmm2, zmm11 + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm13, zmm3, zmm9 + vfmadd132ps zmm14, zmm3, zmm10 + vfmadd132ps zmm15, zmm3, zmm11 + vfmadd132ps zmm12, zmm0, zmm8 + vfmadd132ps zmm13, zmm0, zmm9 + vfmadd132ps zmm14, zmm0, zmm10 + vfmadd132ps zmm15, zmm0, zmm11 // zmm12..15 <- denom + + vdivps zmm4, zmm4, zmm12 + vdivps zmm5, zmm5, zmm13 + vdivps zmm6, zmm6, zmm14 + vdivps zmm7, zmm7, zmm15 + + vmovaps [rdi], zmm4 + vmovaps [rdi + 64], zmm5 + vmovaps [rdi + 128], zmm6 + vmovaps [rdi + 192], zmm7 + + add rdi, 256 + sub rsi, 64 // 64 f32 (256 bytes) were just processed + cmp rsi, 64 + jg {{L}}loop_4 + + cmp rsi, 0 + je {{L}}done + +{{L}}loop_1: + vmovaps zmm4, [rdi] + + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13] + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11] + + vmaxps zmm4, zmm4, zmm0 +
vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + + vminps zmm4, zmm4, zmm1 // zmm4 <- x + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmulps zmm8, zmm4, zmm4 // zmm8 <- x^2 + + vmovaps zmm12, zmm2 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + vfmadd132ps zmm12, zmm3, zmm8 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + vfmadd132ps zmm12, zmm0, zmm8 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps zmm12, zmm1, zmm8 + vbroadcastss zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps zmm12, zmm2, zmm8 + vbroadcastss zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vfmadd132ps zmm12, zmm3, zmm8 + vbroadcastss zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps zmm12, zmm0, zmm8 + vbroadcastss zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vmulps zmm4, zmm4, zmm12 + + vmovaps zmm12, zmm1 + vfmadd132ps zmm12, zmm2, zmm8 + vfmadd132ps zmm12, zmm3, zmm8 + vfmadd132ps zmm12, zmm0, zmm8 + + vdivps zmm4, zmm4, zmm12 + + vmovaps [rdi], zmm4 + add rdi, 64 // one zmm = 16 f32 = 64 bytes + sub rsi, 16 // NOTE(review): element count must be a multiple of 16 - confirm against caller + jnz {{L}}loop_1 + +{{L}}done: + +// ---------------------------------------------------------------------- + + ldmxcsr [rsp + 4] + + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + +{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%} + +{{L}}coeffs_num_low: + {{float}} -9.0 // low +{{L}}coeffs_num_high: + {{float}} 9.0 // high + +{{L}}coeffs_num_alpha_13: + {{float}} -2.76076847742355e-16 // alpha_13 +{{L}}coeffs_num_alpha_11: + {{float}}
2.00018790482477e-13 // alpha_11 +{{L}}coeffs_num_alpha_9: + {{float}} -8.60467152213735e-11 // alpha_9 +{{L}}coeffs_num_alpha_7: + {{float}} 5.12229709037114e-08 // alpha_7 +{{L}}coeffs_num_alpha_5: + {{float}} 1.48572235717979e-05 // alpha_5 +{{L}}coeffs_num_alpha_3: + {{float}} 6.37261928875436e-04 // alpha_3 +{{L}}coeffs_num_alpha_1: + {{float}} 4.89352455891786e-03 // alpha_1 + +{{L}}coeffs_num_beta_6: + {{float}} 1.19825839466702e-06 // beta_6 +{{L}}coeffs_num_beta_4: + {{float}} 1.18534705686654e-04 // beta_4 +{{L}}coeffs_num_beta_2: + {{float}} 2.26843463243900e-03 // beta_2 +{{L}}coeffs_num_beta_0: + {{float}} 4.89352518554385e-03 // beta_0 + +{% if msvc %} +avx512_tanh_f32_{{suffix}} endp +_text ends +end +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq new file mode 100644 index 000000000..16c9d32eb --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq @@ -0,0 +1,29 @@ +// vim: set syntax=asm : + +{{L}}{{label}}: + mov rax, [ rdi + 8 ] + +{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%} +{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%} + +{%capture tmp%}{{to | plus: 1 }}{%endcapture%} + +{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_16}}{%endcapture%} +{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_16|minus:1}}{%endcapture%} +// {{to|minus:from|plus:1}} cols:{{cols}} + +{% for right in (0..cols_min_1) %} + vbroadcastss zmm{{tmp}}, dword ptr [ rax ] + add rax, 4 + + {% for down in (0..mr_over_16_min_1) %} + {%capture acc%}{{mr_over_16|times:right|plus:from|plus:down}}{%endcapture%} + {% if flipped %} + {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{tmp}} + {% else %} + {{op}} zmm{{acc}}, zmm{{tmp}}, zmm{{acc}} + {% endif %} + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop diff --git 
a/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq new file mode 100644 index 000000000..f9da1b35f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq @@ -0,0 +1,23 @@ +// vim: set syntax=asm : + +{{L}}{{label}}: + mov rax, [ rdi + 8 ] + +{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%} +{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%} + +{% for ix in (0..mr_over_16_min_1) %} + vmovups zmm{{to | plus: 1 | plus: ix}}, [rax + {{ix | times: 64}}] +{% endfor %} + +{% if flipped %} + {% for acc in (from..to) %} + {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }} + {% endfor %} +{% else %} + {% for acc in (from..to) %} + {{op}} zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }}, zmm{{acc}} + {% endfor %} +{% endif %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq new file mode 100644 index 000000000..43373c9d8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq @@ -0,0 +1,15 @@ +// vim: set syntax=asm : + +{{L}}{{label}}: + vbroadcastss zmm12, dword ptr [rdi + 8] + {% if flipped %} + {% for reg in (from..to) %} + {{op}} zmm{{reg}}, zmm{{reg}}, zmm12 + {% endfor %} + {% else %} + {% for reg in (from..to) %} + {{op}} zmm{{reg}}, zmm12, zmm{{reg}} + {% endfor %} + {% endif %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..93e56994b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,58 @@ + // Accumulators: 0-9 + // Col regs: 10-14 + // Row regs: 15 + + vbroadcastss ymm15, dword ptr [rcx] + +
vmovaps ymm10, [rax + 0] + vmovaps ymm11, [rax + 32] + vmovaps ymm12, [rax + 64] + vmovaps ymm13, [rax + 96] + vmovaps ymm14, [rax + 128] + + vfmadd231ps ymm0, ymm10, ymm15 + vfmadd231ps ymm1, ymm11, ymm15 + vfmadd231ps ymm2, ymm12, ymm15 + vfmadd231ps ymm3, ymm13, ymm15 + vfmadd231ps ymm4, ymm14, ymm15 + + vmovaps ymm10, [rax + 160] + vmovaps ymm11, [rax + 192] + vmovaps ymm12, [rax + 224] + vmovaps ymm13, [rax + 256] + vmovaps ymm14, [rax + 288] + + vfmadd231ps ymm5, ymm10, ymm15 + vfmadd231ps ymm6, ymm11, ymm15 + vfmadd231ps ymm7, ymm12, ymm15 + vfmadd231ps ymm8, ymm13, ymm15 + vfmadd231ps ymm9, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 4] + + vmovaps ymm10, [rax + 320] + vmovaps ymm11, [rax + 352] + vmovaps ymm12, [rax + 384] + vmovaps ymm13, [rax + 416] + vmovaps ymm14, [rax + 448] + + vfmadd231ps ymm0, ymm10, ymm15 + vfmadd231ps ymm1, ymm11, ymm15 + vfmadd231ps ymm2, ymm12, ymm15 + vfmadd231ps ymm3, ymm13, ymm15 + vfmadd231ps ymm4, ymm14, ymm15 + + vmovaps ymm10, [rax + 480] + vmovaps ymm11, [rax + 512] + vmovaps ymm12, [rax + 544] + vmovaps ymm13, [rax + 576] + vmovaps ymm14, [rax + 608] + + vfmadd231ps ymm5, ymm10, ymm15 + vfmadd231ps ymm6, ymm11, ymm15 + vfmadd231ps ymm7, ymm12, ymm15 + vfmadd231ps ymm8, ymm13, ymm15 + vfmadd231ps ymm9, ymm14, ymm15 + + add rcx, 8 + add rax, 640 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..d29f839e8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli @@ -0,0 +1,33 @@ + // Tile size: 10x1 + // Accumulators: 0-9 + // Col regs: 10-14 + // Row regs: 15 + + vbroadcastss ymm15, dword ptr [rcx] + + vmovaps ymm10, [rax + 0] + vmovaps ymm11, [rax + 32] + vmovaps ymm12, [rax + 64] + vmovaps ymm13, [rax + 96] + vmovaps ymm14, [rax + 128] + + vfmadd231ps ymm0, ymm10, ymm15 + vfmadd231ps ymm1, ymm11, ymm15 + vfmadd231ps ymm2, ymm12, 
ymm15 + vfmadd231ps ymm3, ymm13, ymm15 + vfmadd231ps ymm4, ymm14, ymm15 + + vmovaps ymm10, [rax + 160] + vmovaps ymm11, [rax + 192] + vmovaps ymm12, [rax + 224] + vmovaps ymm13, [rax + 256] + vmovaps ymm14, [rax + 288] + + vfmadd231ps ymm5, ymm10, ymm15 + vfmadd231ps ymm6, ymm11, ymm15 + vfmadd231ps ymm7, ymm12, ymm15 + vfmadd231ps ymm8, ymm13, ymm15 + vfmadd231ps ymm9, ymm14, ymm15 + + add rcx, 4 + add rax, 320 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..6cb824665 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,52 @@ + // Accumulators: 0-9 + // Columns: 14-15 + // Rows: 10-13 + vbroadcastss ymm10, dword ptr [rcx] + vbroadcastss ymm11, dword ptr [rcx + 4] + vbroadcastss ymm12, dword ptr [rcx + 8] + vbroadcastss ymm13, dword ptr [rcx + 12] + + vmovaps ymm14, [rax] + vmovaps ymm15, [rax + 32] + + vfmadd231ps ymm0, ymm14, ymm10 + vfmadd231ps ymm1, ymm15, ymm10 + + vfmadd231ps ymm2, ymm14, ymm11 + vfmadd231ps ymm3, ymm15, ymm11 + + vbroadcastss ymm11, dword ptr [rcx + 16] + + vfmadd231ps ymm4, ymm14, ymm12 + vfmadd231ps ymm5, ymm15, ymm12 + + vfmadd231ps ymm6, ymm14, ymm13 + vfmadd231ps ymm7, ymm15, ymm13 + + vfmadd231ps ymm8, ymm14, ymm11 + vfmadd231ps ymm9, ymm15, ymm11 + + vbroadcastss ymm10, dword ptr [rcx + 20] + vbroadcastss ymm11, dword ptr [rcx + 24] + vbroadcastss ymm12, dword ptr [rcx + 28] + vbroadcastss ymm13, dword ptr [rcx + 32] + + vmovaps ymm14, [rax + 64] + vmovaps ymm15, [rax + 96] + + vfmadd231ps ymm0, ymm14, ymm10 + vfmadd231ps ymm1, ymm15, ymm10 + + vfmadd231ps ymm2, ymm14, ymm11 + vfmadd231ps ymm3, ymm15, ymm11 + + vbroadcastss ymm11, dword ptr [rcx + 36] + + vfmadd231ps ymm4, ymm14, ymm12 + vfmadd231ps ymm5, ymm15, ymm12 + + vfmadd231ps ymm6, ymm14, ymm13 + vfmadd231ps ymm7, ymm15, ymm13 + + vfmadd231ps ymm8, ymm14, ymm11 
+ vfmadd231ps ymm9, ymm15, ymm11 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..59a29b6ca --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli @@ -0,0 +1,30 @@ + // Accumulators: 0-9 + // Columns: 14-15 + // Rows: 10-13 + vbroadcastss ymm10, dword ptr [rcx] + vbroadcastss ymm11, dword ptr [rcx + 4] + vbroadcastss ymm12, dword ptr [rcx + 8] + vbroadcastss ymm13, dword ptr [rcx + 12] + + vmovaps ymm14, [rax] + vmovaps ymm15, [rax + 32] + + vfmadd231ps ymm0, ymm14, ymm10 + vfmadd231ps ymm1, ymm15, ymm10 + + vfmadd231ps ymm2, ymm14, ymm11 + vfmadd231ps ymm3, ymm15, ymm11 + + // Use register 11 as it's "middle" use, leading to a decent + // trade-off between required use next iteration and when it has + // to be used this iteration. + vbroadcastss ymm11, dword ptr [rcx + 16] + + vfmadd231ps ymm4, ymm14, ymm12 + vfmadd231ps ymm5, ymm15, ymm12 + + vfmadd231ps ymm6, ymm14, ymm13 + vfmadd231ps ymm7, ymm15, ymm13 + + vfmadd231ps ymm8, ymm14, ymm11 + vfmadd231ps ymm9, ymm15, ymm11 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli new file mode 100644 index 000000000..c41328bf2 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli @@ -0,0 +1,71 @@ + // Tile size: 2x6 + // Accumulators: 0-11 + // Col regs: ymm14-15 + // Row regs: ymm12-13 + + vbroadcastss ymm14, dword ptr [rcx] + vmovaps ymm12, [rax] + vmovaps ymm13, [rax + 32] + vbroadcastss ymm15, dword ptr [rcx + 4] + + vfmadd231ps ymm0, ymm12, ymm14 + vfmadd231ps ymm1, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx + 8] + + vfmadd231ps ymm2, ymm12, ymm15 + vfmadd231ps ymm3, ymm13, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 12] + + vfmadd231ps ymm4, 
ymm12, ymm14 + vfmadd231ps ymm5, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx + 16] + + vfmadd231ps ymm6, ymm12, ymm15 + vfmadd231ps ymm7, ymm13, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 20] + + vfmadd231ps ymm8, ymm12, ymm14 + vfmadd231ps ymm9, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx+24] + + vfmadd231ps ymm10, ymm12, ymm15 + vfmadd231ps ymm11, ymm13, ymm15 + + // Iteration two + vmovaps ymm12, [rax + 64] + vmovaps ymm13, [rax + 96] + vbroadcastss ymm15, dword ptr [rcx + 24 + 4] + + vfmadd231ps ymm0, ymm12, ymm14 + vfmadd231ps ymm1, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx + 24 + 8] + + vfmadd231ps ymm2, ymm12, ymm15 + vfmadd231ps ymm3, ymm13, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 24 + 12] + + vfmadd231ps ymm4, ymm12, ymm14 + vfmadd231ps ymm5, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx + 24 + 16] + + vfmadd231ps ymm6, ymm12, ymm15 + vfmadd231ps ymm7, ymm13, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 24 + 20] + + vfmadd231ps ymm8, ymm12, ymm14 + vfmadd231ps ymm9, ymm13, ymm14 + + vfmadd231ps ymm10, ymm12, ymm15 + vfmadd231ps ymm11, ymm13, ymm15 + + add rax, 128 + add rcx, 48 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli new file mode 100644 index 000000000..1c7fc2765 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli @@ -0,0 +1,39 @@ + // Tile size: 2x6 + // Accumulators: 0-11 + // Col regs: ymm14-15 + // Row regs: ymm12-13 + + // Load ordered by earliest use for first 2x2 block + vbroadcastss ymm14, dword ptr [rcx] + vmovaps ymm12, [rax] + vmovaps ymm13, [rax + 32] + vbroadcastss ymm15, dword ptr [rcx + 4] + + vfmadd231ps ymm0, ymm12, ymm14 + vfmadd231ps ymm1, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx + 8] + + vfmadd231ps ymm2, ymm12, ymm15 + vfmadd231ps ymm3, ymm13, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 12] + + vfmadd231ps 
ymm4, ymm12, ymm14 + vfmadd231ps ymm5, ymm13, ymm14 + + vbroadcastss ymm14, dword ptr [rcx + 16] + + vfmadd231ps ymm6, ymm12, ymm15 + vfmadd231ps ymm7, ymm13, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 20] + + vfmadd231ps ymm8, ymm12, ymm14 + vfmadd231ps ymm9, ymm13, ymm14 + + vfmadd231ps ymm10, ymm12, ymm15 + vfmadd231ps ymm11, ymm13, ymm15 + + add rax, 64 + add rcx, 24 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..a9e6ea33f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,60 @@ + // Tile size: 3x4 + // Accumulators: 0-11 + // Col regs: ymm12-14 + // Row regs: ymm15 + + vmovaps ymm12, [rax] + vmovaps ymm13, [rax+32] + vmovaps ymm14, [rax+64] + + vbroadcastss ymm15, dword ptr [rcx + 0] + + vfmadd231ps ymm0, ymm12, ymm15 + vfmadd231ps ymm1, ymm13, ymm15 + vfmadd231ps ymm2, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 4] + + vfmadd231ps ymm3, ymm12, ymm15 + vfmadd231ps ymm4, ymm13, ymm15 + vfmadd231ps ymm5, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 8] + + vfmadd231ps ymm6, ymm12, ymm15 + vfmadd231ps ymm7, ymm13, ymm15 + vfmadd231ps ymm8, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 12] + + vfmadd231ps ymm9, ymm12, ymm15 + vfmadd231ps ymm10, ymm13, ymm15 + vfmadd231ps ymm11, ymm14, ymm15 + + vmovaps ymm12, [rax + 96] + vmovaps ymm13, [rax + 128] + vmovaps ymm14, [rax + 160] + + vbroadcastss ymm15, dword ptr [rcx + 16] + + vfmadd231ps ymm0, ymm12, ymm15 + vfmadd231ps ymm1, ymm13, ymm15 + vfmadd231ps ymm2, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 20] + + vfmadd231ps ymm3, ymm12, ymm15 + vfmadd231ps ymm4, ymm13, ymm15 + vfmadd231ps ymm5, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 24] + + vfmadd231ps ymm6, ymm12, ymm15 + vfmadd231ps ymm7, ymm13, ymm15 + vfmadd231ps ymm8, ymm14, ymm15 + + 
vbroadcastss ymm15, dword ptr [rcx + 28] + + vfmadd231ps ymm9, ymm12, ymm15 + vfmadd231ps ymm10, ymm13, ymm15 + vfmadd231ps ymm11, ymm14, ymm15 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..eff5cd237 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli @@ -0,0 +1,32 @@ + // Tile size: 3x4 + // Accumulators: 0-11 + // Col regs: ymm12-14 + // Row regs: ymm15 + + vmovaps ymm12, [rax] + vmovaps ymm13, [rax+32] + vmovaps ymm14, [rax+64] + + vbroadcastss ymm15, dword ptr [rcx + 0] + + vfmadd231ps ymm0, ymm12, ymm15 + vfmadd231ps ymm1, ymm13, ymm15 + vfmadd231ps ymm2, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 4] + + vfmadd231ps ymm3, ymm12, ymm15 + vfmadd231ps ymm4, ymm13, ymm15 + vfmadd231ps ymm5, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 8] + + vfmadd231ps ymm6, ymm12, ymm15 + vfmadd231ps ymm7, ymm13, ymm15 + vfmadd231ps ymm8, ymm14, ymm15 + + vbroadcastss ymm15, dword ptr [rcx + 12] + + vfmadd231ps ymm9, ymm12, ymm15 + vfmadd231ps ymm10, ymm13, ymm15 + vfmadd231ps ymm11, ymm14, ymm15 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..faaf1ba4d --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,69 @@ + // Tile size: 4x3 + // Accumulators: 0-11 + // Col regs: ymm12 + // Row regs: ymm13-15 + + // Load col of A + vmovaps ymm12, [rax] + + // Fill 3 cols of B + vbroadcastss ymm13, dword ptr [rcx + 0] + vbroadcastss ymm14, dword ptr [rcx + 4] + vbroadcastss ymm15, dword ptr [rcx + 8] + + // N.B. 
Stepping cols in inner loop + vfmadd231ps ymm0, ymm12, ymm13 + vfmadd231ps ymm4, ymm12, ymm14 + vfmadd231ps ymm8, ymm12, ymm15 + + vmovaps ymm12, [rax+32] + + vfmadd231ps ymm1, ymm12, ymm13 + vfmadd231ps ymm5, ymm12, ymm14 + vfmadd231ps ymm9, ymm12, ymm15 + + vmovaps ymm12, [rax+64] + + vfmadd231ps ymm2, ymm12, ymm13 + vfmadd231ps ymm6, ymm12, ymm14 + vfmadd231ps ymm10, ymm12, ymm15 + + vmovaps ymm12, [rax+96] + + vfmadd231ps ymm3, ymm12, ymm13 + vfmadd231ps ymm7, ymm12, ymm14 + vfmadd231ps ymm11, ymm12, ymm15 + + // Load col of A, switching col! + vmovaps ymm13, [rax + 128] + + // Fill 3 cols of B + vbroadcastss ymm14, dword ptr [rcx + 12] + vbroadcastss ymm15, dword ptr [rcx + 16] + vbroadcastss ymm12, dword ptr [rcx + 20] + + // N.B. Stepping cols in inner loop + vfmadd231ps ymm0, ymm13, ymm14 + vfmadd231ps ymm4, ymm13, ymm15 + vfmadd231ps ymm8, ymm13, ymm12 + + vmovaps ymm13, [rax + 160] + + vfmadd231ps ymm1, ymm13, ymm14 + vfmadd231ps ymm5, ymm13, ymm15 + vfmadd231ps ymm9, ymm13, ymm12 + + vmovaps ymm13, [rax + 192] + + vfmadd231ps ymm2, ymm13, ymm14 + vfmadd231ps ymm6, ymm13, ymm15 + vfmadd231ps ymm10, ymm13, ymm12 + + vmovaps ymm13, [rax + 224] + + vfmadd231ps ymm3, ymm13, ymm14 + vfmadd231ps ymm7, ymm13, ymm15 + vfmadd231ps ymm11, ymm13, ymm12 + + add rcx, 24 + add rax, 256 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..2a6b43203 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli @@ -0,0 +1,38 @@ + // Tile size: 4x3 + // Accumulators: 0-11 + // Col regs: ymm12 + // Row regs: ymm13-15 + + // Load col of A + vmovaps ymm12, [rax] + + // Fill 3 cols of B + vbroadcastss ymm13, dword ptr [rcx + 0] + vbroadcastss ymm14, dword ptr [rcx + 4] + vbroadcastss ymm15, dword ptr [rcx + 8] + + // N.B. 
Stepping cols in inner loop + vfmadd231ps ymm0, ymm12, ymm13 + vfmadd231ps ymm4, ymm12, ymm14 + vfmadd231ps ymm8, ymm12, ymm15 + + vmovaps ymm12, [rax+32] + + vfmadd231ps ymm1, ymm12, ymm13 + vfmadd231ps ymm5, ymm12, ymm14 + vfmadd231ps ymm9, ymm12, ymm15 + + vmovaps ymm12, [rax+64] + + vfmadd231ps ymm2, ymm12, ymm13 + vfmadd231ps ymm6, ymm12, ymm14 + vfmadd231ps ymm10, ymm12, ymm15 + + vmovaps ymm12, [rax+96] + + vfmadd231ps ymm3, ymm12, ymm13 + vfmadd231ps ymm7, ymm12, ymm14 + vfmadd231ps ymm11, ymm12, ymm15 + + add rcx, 12 + add rax, 128 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..932763061 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,63 @@ + // Tile size: 5x2 + // Accumulators: 0-9 + // Col regs: ymm10-13 + // Row regs: ymm14-15 + + vmovaps ymm10, [rax] + vbroadcastss ymm14, dword ptr [rcx + 0] + vbroadcastss ymm15, dword ptr [rcx + 4] + vmovaps ymm11, [rax + 32] + + // NB stepping column-wise + vfmadd231ps ymm0, ymm10, ymm14 + vfmadd231ps ymm5, ymm10, ymm15 + + vmovaps ymm12, [rax + 64] + + vfmadd231ps ymm1, ymm11, ymm14 + vfmadd231ps ymm6, ymm11, ymm15 + + vmovaps ymm13, [rax + 96] + + vfmadd231ps ymm2, ymm12, ymm14 + vfmadd231ps ymm7, ymm12, ymm15 + + vmovaps ymm10, [rax + 128] + + vfmadd231ps ymm3, ymm13, ymm14 + vfmadd231ps ymm8, ymm13, ymm15 + + vmovaps ymm11, [rax + 160] + + vfmadd231ps ymm4, ymm10, ymm14 + vfmadd231ps ymm9, ymm10, ymm15 + + vbroadcastss ymm14, dword ptr [rcx + 8] + vbroadcastss ymm15, dword ptr [rcx + 12] + + vmovaps ymm12, [rax + 192] + + // NB stepping column-wise + vfmadd231ps ymm0, ymm11, ymm14 + vfmadd231ps ymm5, ymm11, ymm15 + + vmovaps ymm13, [rax + 224] + + vfmadd231ps ymm1, ymm12, ymm14 + vfmadd231ps ymm6, ymm12, ymm15 + + vmovaps ymm10, [rax + 256] + + vfmadd231ps ymm2, ymm13, ymm14 + vfmadd231ps 
ymm7, ymm13, ymm15 + + vmovaps ymm11, [rax + 288] + + vfmadd231ps ymm3, ymm10, ymm14 + vfmadd231ps ymm8, ymm10, ymm15 + + vfmadd231ps ymm4, ymm11, ymm14 + vfmadd231ps ymm9, ymm11, ymm15 + + add rax, 320 + add rcx, 16 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..add37cea1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli @@ -0,0 +1,34 @@ + // Tile size: 5x2 + // Accumulators: 0-9 + // Col regs: ymm10-13 + // Row regs: ymm14-15 + + vmovaps ymm10, [rax] + vbroadcastss ymm14, dword ptr [rcx + 0] + vbroadcastss ymm15, dword ptr [rcx + 4] + vmovaps ymm11, [rax + 32] + + // NB stepping column-wise + vfmadd231ps ymm0, ymm10, ymm14 + vfmadd231ps ymm5, ymm10, ymm15 + + vmovaps ymm12, [rax + 64] + + vfmadd231ps ymm1, ymm11, ymm14 + vfmadd231ps ymm6, ymm11, ymm15 + + vmovaps ymm13, [rax + 96] + + vfmadd231ps ymm2, ymm12, ymm14 + vfmadd231ps ymm7, ymm12, ymm15 + + vmovaps ymm11, [rax + 128] + + vfmadd231ps ymm3, ymm13, ymm14 + vfmadd231ps ymm8, ymm13, ymm15 + + vfmadd231ps ymm4, ymm11, ymm14 + vfmadd231ps ymm9, ymm11, ymm15 + + add rax, 160 + add rcx, 8 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..0d5f7382e --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,25 @@ + // Tile size: 6x1 + // Accumulators: 0-5 + // Col regs: 6-11 + // Row regs: 15 + + + vbroadcastss ymm15, dword ptr [rcx] + vfmadd231ps ymm0, ymm15, [rax] + vfmadd231ps ymm1, ymm15, [rax + 32] + vfmadd231ps ymm2, ymm15, [rax + 64] + vfmadd231ps ymm3, ymm15, [rax + 96] + vfmadd231ps ymm4, ymm15, [rax + 128] + vfmadd231ps ymm5, ymm15, [rax + 160] + + vbroadcastss ymm14, dword ptr [rcx + 4] + + vfmadd231ps 
ymm0, ymm14, [rax + 192] + vfmadd231ps ymm1, ymm14, [rax + 224] + vfmadd231ps ymm2, ymm14, [rax + 256] + vfmadd231ps ymm3, ymm14, [rax + 288] + vfmadd231ps ymm4, ymm14, [rax + 320] + vfmadd231ps ymm5, ymm14, [rax + 352] + + add rax, 384 + add rcx, 8 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..b9eb475e8 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli @@ -0,0 +1,29 @@ + // Tile size: 6x1 + // Accumulators: 0-5 + // Col regs: 10-14 + // Row regs: 15 + + vbroadcastss ymm15, dword ptr [rcx] + + vmovups ymm10, [rax] + vmulps ymm10, ymm10, ymm15 + vaddps ymm0, ymm0, ymm10 + vmovups ymm11, [rax + 32] + vmulps ymm11, ymm11, ymm15 + vaddps ymm1, ymm1, ymm11 + vmovups ymm12, [rax + 64] + vmulps ymm12, ymm12, ymm15 + vaddps ymm2, ymm2, ymm12 + vmovups ymm13, [rax + 96] + vmulps ymm13, ymm13, ymm15 + vaddps ymm3, ymm3, ymm13 + vmovups ymm14, [rax + 128] + vmulps ymm14, ymm14, ymm15 + vaddps ymm4, ymm4, ymm14 + vmovups ymm10, [rax + 160] // ymm15 still holds the B broadcast: load A into dead scratch ymm10 + vmulps ymm10, ymm10, ymm15 + vaddps ymm5, ymm5, ymm10 + + + add rcx, 4 + add rax, 192 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..885e84add --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,70 @@ + // Tile size: 6x2 + // Accumulators: 0-11 + // Col regs: ymm12-13 + // Row regs: ymm14-15 + + vmovaps ymm12, [rax] + vbroadcastss ymm14, dword ptr [rcx + 0] + vbroadcastss ymm15, dword ptr [rcx + 4] + vmovaps ymm13, [rax + 32] + + vfmadd231ps ymm0, ymm12, ymm14 + vfmadd231ps ymm6, ymm12, ymm15 + + vmovaps ymm12, [rax + 64] + + vfmadd231ps ymm1, ymm13, ymm14 + vfmadd231ps ymm7, ymm13, ymm15 + + vmovaps ymm13, [rax + 96] + + vfmadd231ps
ymm2, ymm12, ymm14 + vfmadd231ps ymm8, ymm12, ymm15 + + vmovaps ymm12, [rax + 128] + + vfmadd231ps ymm3, ymm13, ymm14 + vfmadd231ps ymm9, ymm13, ymm15 + + vmovaps ymm13, [rax + 160] + + vfmadd231ps ymm4, ymm12, ymm14 + vfmadd231ps ymm10, ymm12, ymm15 + + vmovaps ymm12, [rax + 192] + + vfmadd231ps ymm5, ymm13, ymm14 + vfmadd231ps ymm11, ymm13, ymm15 + + vbroadcastss ymm14, dword ptr [rcx + 8] // reload B only after the last FMAs of the first step + vbroadcastss ymm15, dword ptr [rcx + 12] + vmovaps ymm13, [rax + 224] + + vfmadd231ps ymm0, ymm12, ymm14 + vfmadd231ps ymm6, ymm12, ymm15 + + vmovaps ymm12, [rax + 256] + + vfmadd231ps ymm1, ymm13, ymm14 + vfmadd231ps ymm7, ymm13, ymm15 + + vmovaps ymm13, [rax + 288] + + vfmadd231ps ymm2, ymm12, ymm14 + vfmadd231ps ymm8, ymm12, ymm15 + + vmovaps ymm12, [rax + 320] + + vfmadd231ps ymm3, ymm13, ymm14 + vfmadd231ps ymm9, ymm13, ymm15 + + vmovaps ymm13, [rax + 352] + + vfmadd231ps ymm4, ymm12, ymm14 + vfmadd231ps ymm10, ymm12, ymm15 + + vfmadd231ps ymm5, ymm13, ymm14 + vfmadd231ps ymm11, ymm13, ymm15 + + add rax, 384 + add rcx, 16 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..df8d6f19f --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli @@ -0,0 +1,38 @@ + // Tile size: 6x2 + // Accumulators: 0-11 + // Col regs: 12-13 + // Row regs: 14-15 + + vmovaps ymm12, [rax] + vbroadcastss ymm14, dword ptr [rcx + 0] + vbroadcastss ymm15, dword ptr [rcx + 4] + vmovaps ymm13, [rax + 32] + + vfmadd231ps ymm0, ymm12, ymm14 + vfmadd231ps ymm6, ymm12, ymm15 + + vmovaps ymm12, [rax + 64] + + vfmadd231ps ymm1, ymm13, ymm14 + vfmadd231ps ymm7, ymm13, ymm15 + + vmovaps ymm13, [rax + 96] + + vfmadd231ps ymm2, ymm12, ymm14 + vfmadd231ps ymm8, ymm12, ymm15 + + vmovaps ymm12, [rax + 128] + + vfmadd231ps ymm3, ymm13, ymm14 + vfmadd231ps ymm9, ymm13, ymm15 + + vmovaps ymm13, [rax + 160] + + vfmadd231ps ymm4, ymm12, ymm14
+ vfmadd231ps ymm10, ymm12, ymm15 + + vfmadd231ps ymm5, ymm13, ymm14 + vfmadd231ps ymm11, ymm13, ymm15 + + add rcx, 8 + add rax, 192 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..0c52cbac7 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,37 @@ + // Tile size: 6x1 + // Accumulators: 0-5 + // Col regs: 6-13 + // Row regs: 14-15 + vbroadcastss ymm15, dword ptr [rcx] + + vmovaps ymm6, [rax + 0] + vmovaps ymm7, [rax + 32] + vmovaps ymm8, [rax + 64] + vmovaps ymm9, [rax + 96] + + vfmadd231ps ymm0, ymm6, ymm15 + vmovaps ymm10, [rax + 128] + + vfmadd231ps ymm1, ymm7, ymm15 + vmovaps ymm11, [rax + 160] + vfmadd231ps ymm2, ymm8, ymm15 + vbroadcastss ymm14, dword ptr [rcx+4] + vfmadd231ps ymm3, ymm9, ymm15 + vmovaps ymm12, [rax + 192] + vfmadd231ps ymm4, ymm10, ymm15 + vmovaps ymm13, [rax + 224] + vfmadd231ps ymm5, ymm11, ymm15 + + vmovaps ymm6, [rax + 256] + vfmadd231ps ymm0, ymm12, ymm14 + vmovaps ymm7, [rax + 288] + vfmadd231ps ymm1, ymm13, ymm14 + + vmovaps ymm8, [rax + 320] // second step, row 4 (second step starts at rax + 192) + vfmadd231ps ymm2, ymm6, ymm14 + + vmovaps ymm9, [rax + 352] // second step, row 5 + vfmadd231ps ymm3, ymm7, ymm14 + + vfmadd231ps ymm4, ymm8, ymm14 + vfmadd231ps ymm5, ymm9, ymm14 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..95cd32307 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli @@ -0,0 +1,22 @@ + // Tile size: 6x1 + // Accumulators: 0-5 + // Col regs: 6-11 + // Row regs: 15 + vbroadcastss ymm15, dword ptr [rcx] + + vmovaps ymm6, [rax + 0] + vmovaps ymm7, [rax + 32] + vmovaps ymm8, [rax + 64] + vmovaps ymm9, [rax + 96] + + vfmadd231ps ymm0, ymm6, ymm15 + vfmadd231ps ymm1, ymm7, ymm15 + + vmovaps ymm10, [rax
+ 128] + vfmadd231ps ymm2, ymm8, ymm15 + + vmovaps ymm11, [rax + 160] + vfmadd231ps ymm3, ymm9, ymm15 + + vfmadd231ps ymm4, ymm10, ymm15 + vfmadd231ps ymm5, ymm11, ymm15 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..2348b2f72 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,48 @@ + // Accumulators: 0-7 + // Columns: 14-15 + // Rows: 8-13 + + + vbroadcastss ymm15, dword ptr [rcx] + vbroadcastss ymm14, dword ptr [rcx + 4] + + vmovaps ymm8, [rax] + vmovaps ymm9, [rax + 32] + vmovaps ymm10, [rax + 64] + vmovaps ymm11, [rax + 96] + vmovaps ymm12, [rax + 128] + vmovaps ymm13, [rax + 160] + + vfmadd231ps ymm0, ymm15, ymm8 + vfmadd231ps ymm1, ymm15, ymm9 + vfmadd231ps ymm2, ymm15, ymm10 + vfmadd231ps ymm3, ymm15, ymm11 + vfmadd231ps ymm4, ymm15, ymm12 + vfmadd231ps ymm5, ymm15, ymm13 + + vmovaps ymm8, [rax + 192] + vmovaps ymm9, [rax + 224] + vmovaps ymm10, [rax + 256] + vmovaps ymm11, [rax + 288] + vmovaps ymm12, [rax + 320] + vmovaps ymm13, [rax + 352] + + vfmadd231ps ymm6, ymm15, ymm8 + vfmadd231ps ymm7, ymm15, ymm9 + vfmadd231ps ymm0, ymm14, ymm10 + vfmadd231ps ymm1, ymm14, ymm11 + vfmadd231ps ymm2, ymm14, ymm12 + vfmadd231ps ymm3, ymm14, ymm13 + + vmovaps ymm8, [rax + 384] + vmovaps ymm9, [rax + 416] + vmovaps ymm10, [rax + 448] + vmovaps ymm11, [rax + 480] + + vfmadd231ps ymm4, ymm14, ymm8 + vfmadd231ps ymm5, ymm14, ymm9 + vfmadd231ps ymm6, ymm14, ymm10 + vfmadd231ps ymm7, ymm14, ymm11 + + add rcx, 8 + add rax, 512 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..c170e664a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli @@ -0,0 +1,33 @@ + // Tile size: 8x1 
+ // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + vbroadcastss ymm15, dword ptr [rcx] + + vmovaps ymm8, [rax + 0] + vmovaps ymm9, [rax + 32] + vmovaps ymm10, [rax + 64] + vmovaps ymm11, [rax + 96] + + vfmadd231ps ymm0, ymm8, ymm15 + vfmadd231ps ymm1, ymm9, ymm15 + + vmovaps ymm12, [rax + 128] + vmovaps ymm13, [rax + 160] + + vfmadd231ps ymm2, ymm10, ymm15 + vfmadd231ps ymm3, ymm11, ymm15 + + vmovaps ymm14, [rax + 192] + vmovaps ymm11, [rax + 224] + + vfmadd231ps ymm4, ymm12, ymm15 + vfmadd231ps ymm5, ymm13, ymm15 + + + vfmadd231ps ymm6, ymm14, ymm15 + vfmadd231ps ymm7, ymm11, ymm15 + + add rcx, 4 + add rax, 256 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli new file mode 100644 index 000000000..f8e819336 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli @@ -0,0 +1,58 @@ + // Tile size: 1x8 + // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + + vmovaps ymm15, [rax] + + vbroadcastss ymm8, dword ptr [rcx + 0 * 4] + vfmadd231ps ymm0, ymm15, ymm8 + + vbroadcastss ymm9, dword ptr [rcx + 1 * 4] + vfmadd231ps ymm1, ymm15, ymm9 + + vbroadcastss ymm10, dword ptr [rcx + 2 * 4] + vfmadd231ps ymm2, ymm15, ymm10 + + vbroadcastss ymm11, dword ptr [rcx + 3 * 4] + vfmadd231ps ymm3, ymm15, ymm11 + + vbroadcastss ymm12, dword ptr [rcx + 4 * 4] + vfmadd231ps ymm4, ymm15, ymm12 + + vbroadcastss ymm13, dword ptr [rcx + 5 * 4] + vfmadd231ps ymm5, ymm15, ymm13 + + vbroadcastss ymm10, dword ptr [rcx + 6 * 4] + vfmadd231ps ymm6, ymm15, ymm10 + + vbroadcastss ymm11, dword ptr [rcx + 7 * 4] + vfmadd231ps ymm7, ymm15, ymm11 + + + vmovaps ymm15, [rax + 32] // second step: next A vector; assumes 32-byte A and B steps, TODO confirm against includer + + vbroadcastss ymm8, dword ptr [rcx + 8 * 4] + vfmadd231ps ymm0, ymm15, ymm8 + + vbroadcastss ymm9, dword ptr [rcx + 9 * 4] + vfmadd231ps ymm1, ymm15, ymm9 + + vbroadcastss ymm10, dword ptr [rcx + 10 * 4] + vfmadd231ps ymm2, ymm15, ymm10 + +
vbroadcastss ymm11, dword ptr [rcx + 11 * 4] + vfmadd231ps ymm3, ymm15, ymm11 + + vbroadcastss ymm12, dword ptr [rcx + 12 * 4] + vfmadd231ps ymm4, ymm15, ymm12 + + vbroadcastss ymm13, dword ptr [rcx + 13 * 4] + vfmadd231ps ymm5, ymm15, ymm13 + + vbroadcastss ymm10, dword ptr [rcx + 14 * 4] + vfmadd231ps ymm6, ymm15, ymm10 + + vbroadcastss ymm11, dword ptr [rcx + 15 * 4] + vfmadd231ps ymm7, ymm15, ymm11 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli new file mode 100644 index 000000000..1af4afecc --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli @@ -0,0 +1,30 @@ + // Tile size: 1x8 + // Accumulators: 0-7 + // Col regs: 8-14 + // Row regs: 15 + + vmovaps ymm15, [rax] + + vbroadcastss ymm8, dword ptr [rcx + 0 * 4] + vfmadd231ps ymm0, ymm15, ymm8 + + vbroadcastss ymm9, dword ptr [rcx + 1 * 4] + vfmadd231ps ymm1, ymm15, ymm9 + + vbroadcastss ymm10, dword ptr [rcx + 2 * 4] + vfmadd231ps ymm2, ymm15, ymm10 + + vbroadcastss ymm11, dword ptr [rcx + 3 * 4] + vfmadd231ps ymm3, ymm15, ymm11 + + vbroadcastss ymm12, dword ptr [rcx + 4 * 4] + vfmadd231ps ymm4, ymm15, ymm12 + + vbroadcastss ymm13, dword ptr [rcx + 5 * 4] + vfmadd231ps ymm5, ymm15, ymm13 + + vbroadcastss ymm10, dword ptr [rcx + 6 * 4] + vfmadd231ps ymm6, ymm15, ymm10 + + vbroadcastss ymm11, dword ptr [rcx + 7 * 4] + vfmadd231ps ymm7, ymm15, ymm11 diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl new file mode 100644 index 000000000..70c7ba85c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl @@ -0,0 +1,682 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 8x8: + + ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+ return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% if msvc %} + +_text segment +avx2_mmm_i32_8x8_{{suffix}} proc + +{% else %} + +.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}avx2_mmm_i32_8x8_{{suffix}} +{{G}}avx2_mmm_i32_8x8_{{suffix}}: +.cfi_startproc + +{% endif %} + + push rbp + mov rbp, rsp + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps [rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + push rdi + push rsi + + mov rdi, rcx + +{% endif %} + + push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +.cfi_def_cfa_offset 64 +{% endif %} + + stmxcsr [rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] + +{% include "dispatcher.tmpliq" %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov r12, [rdi + 32] // packing + mov rbx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rcx, [rdi + 8] // k + test rcx, rcx + jz {{L}}non_linear_loop + + cmp r12, 1 + je {{L}}main_loop_packed_packed_i8i8 + +{{L}}main_loop_packed_packed: + vmovaps ymm12, [rax] + + {% for i in (0..7) %} + vbroadcastss ymm14, dword ptr [rbx + {{i}} * 4] + vpmulld ymm13, ymm12, ymm14 + vpaddd ymm{{i}}, ymm{{i}}, ymm13 + {% endfor %} + + add rax, 32 + add rbx, 32 + dec rcx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + 
+{{L}}main_loop_packed_packed_i8i8: + movq xmm8, qword ptr [rax] // read 8 bytes + vpmovsxbw ymm8, xmm8 // sign-extend i8 to i16 + + vpbroadcastb ymm9, byte ptr [rbx] // broadcast 1 byte from B + vpbroadcastb ymm10, byte ptr [rbx + 1] // broadcast 1 byte from B + vpbroadcastb ymm11, byte ptr [rbx + 2] // broadcast 1 byte from B + vpbroadcastb ymm12, byte ptr [rbx + 3] // broadcast 1 byte from B + vpmovsxbw ymm9, xmm9 // sign-extend i8 to i16 + vpmovsxbw ymm10, xmm10 // sign-extend i8 to i16 + vpmovsxbw ymm11, xmm11 // sign-extend i8 to i16 + vpmovsxbw ymm12, xmm12 // sign-extend i8 to i16 + + vpmullw ymm9, ymm9, ymm8 + vpmullw ymm10, ymm10, ymm8 + vpmullw ymm11, ymm11, ymm8 + vpmullw ymm12, ymm12, ymm8 + vpmovsxwd ymm9, xmm9 // sign-extend i16 products to i32x8 + vpmovsxwd ymm10, xmm10 // sign-extend i16 products to i32x8 + vpmovsxwd ymm11, xmm11 // sign-extend i16 products to i32x8 + vpmovsxwd ymm12, xmm12 // sign-extend i16 products to i32x8 + vpaddd ymm0, ymm0, ymm9 + vpaddd ymm1, ymm1, ymm10 + vpaddd ymm2, ymm2, ymm11 + vpaddd ymm3, ymm3, ymm12 + + vpbroadcastb ymm9, byte ptr [rbx + 4] + vpbroadcastb ymm10, byte ptr [rbx + 5] + vpbroadcastb ymm11, byte ptr [rbx + 6] + vpbroadcastb ymm12, byte ptr [rbx + 7] + vpmovsxbw ymm9, xmm9 + vpmovsxbw ymm10, xmm10 + vpmovsxbw ymm11, xmm11 + vpmovsxbw ymm12, xmm12 + + vpmullw ymm9, ymm9, ymm8 + vpmullw ymm10, ymm10, ymm8 + vpmullw ymm11, ymm11, ymm8 + vpmullw ymm12, ymm12, ymm8 + vpmovsxwd ymm9, xmm9 // sign-extend i16 products to i32x8 + vpmovsxwd ymm10, xmm10 // sign-extend i16 products to i32x8 + vpmovsxwd ymm11, xmm11 // sign-extend i16 products to i32x8 + vpmovsxwd ymm12, xmm12 // sign-extend i16 products to i32x8 + vpaddd ymm4, ymm4, ymm9 + vpaddd ymm5, ymm5, ymm10 + vpaddd ymm6, ymm6, ymm11 + vpaddd ymm7, ymm7, ymm12 + + add rbx, 8 + add rax, 8 + dec rcx + jnz {{L}}main_loop_packed_packed_i8i8 + + jmp {{L}}non_linear_loop + +{% include "fma_mmm_i32_scalars.tmpliq" from:0, to:7 %} +{% include "fma_mmm_i32_per_rows.tmpliq" mr:8,from:0, to:7 %} +{% include "fma_mmm_i32_per_cols.tmpliq" mr:8,from:0, to:7 %} +{%
include "fma_mmm_load_tile.tmpliq" from:0, to:7 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + mov r8, [rdi + 32] // item size + + cmp r8, 4 + je {{L}}non_linear_addc_i32 + +{% comment %} +// This is not great as vgatherdps reads 32-bits values and goes beyond our buffer. Probably harmless though. +// Commented and replaced with the "mov al" loop beyond to pacify valgrind. +// ymm14 and ymm15 are the same as in the non_linear_addc_i32 case (compute them before the test right above here. +// {% for i in (0..7) %} +// vpcmpeqd ymm15, ymm15, ymm15 +// vgatherdps ymm12, [ r10 + ymm14 ], ymm15 // 0xxx 1xxx 2xxx 3xxx 4xxx 5xxx 6xxx 7xxx +// +// // we need to go through vpmovsxbd, shuffling naively erases signs +// vpshufb ymm12, ymm12, ymm10 // 0123 0123 0123 0123 4567 4567 4567 4567 +// +// vpermd ymm12, ymm11, ymm12 // 0123 4567 +// vpmovsxbd ymm12, xmm12 // sign extend +// +// vpaddd ymm{{i}}, ymm{{i}}, ymm12 +// add r10, rbx +// {% endfor %} +{% endcomment %} + + {% for col in (0..7) %} + mov r8, r10 + {% for half in (0..1) %} + {% for lane in (0..3) %} + mov al, [ r8 ] + add r8, rsi + movsx eax, al + pinsrd xmm10, eax, {{lane}} + {% endfor %} + vperm2f128 ymm10, ymm10, ymm10, 1 + {% endfor %} + vpaddd ymm{{col}}, ymm{{col}}, ymm10 + add r10, rbx + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}non_linear_addc_i32: + + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} + vpermq ymm14, ymm14, 78 // 0b01001110 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} + vpermq ymm14, ymm14, 78 // 0b01001110 + + +{% if msvc %} + vpbroadcastd ymm10, dword ptr [ offset byte_shuffle ] + vmovups ymm11, dword ptr [ offset i128_shuffle ] +{% else %} + vpbroadcastd ymm10, [ rip + {{L}}byte_shuffle ] + vmovups ymm11, [ rip + {{L}}i128_shuffle ] +{% endif %} + +{% for i in (0..7) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ 
r10 + ymm14 ], ymm15 + vpaddd ymm{{i}}, ymm{{i}}, ymm12 + add r10, rbx +{% endfor %} + + jmp {{L}}non_linear_loop + +{% if msvc %} +.data +byte_shuffle dd 201851904 // 0x0c080400 +i128_shuffle dd 0, 4 +.code +{% else %} +{{L}}byte_shuffle: .int 201851904 // 0x0c080400 +{{L}}i128_shuffle: .int 0, 4 +{% endif %} + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups ymm12, [rax] + +{% for i in (0..7) %} + vbroadcastss ymm14, dword ptr [rbx + {{i|times:4}} ] + vpmulld ymm15, ymm12, ymm14 + vpaddd ymm{{i}}, ymm{{i}}, ymm15 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_scale: + mov r8, [ rdi + 16 ] // policy + vbroadcastss ymm8, dword ptr [rdi + 24] // multi + + mov rax, 1 + movq xmm9, rax + vpbroadcastq ymm9, xmm9 // ymm9 <- 1 + + mov rax, [ rdi + 8 ] // xmm10 <- shift + 31 + add rax, 31 + movq xmm10, rax + vpbroadcastq ymm10, xmm10 + + mov rax, 1 + movq xmm11, rax + vpsubq ymm12, ymm10, ymm9 // shift+31 - 1 + vpsllq ymm11, ymm9, xmm12 // ymm11 <- 1 << (shift + 31 - 1) + + cmp r8, 1 + je {{L}}q_scale_rounding_zero + cmp r8, 2 + je {{L}}q_scale_rounding_away + cmp r8, 3 + je {{L}}q_scale_rounding_minus_inf + cmp r8, 4 + je {{L}}q_scale_rounding_plus_inf + cmp r8, 5 + je {{L}}q_scale_rounding_even + cmp r8, 6 + je {{L}}q_scale_rounding_odd + + jmp {{L}}unsupported + +{{L}}q_scale_rounding_zero: // signum * ( (abs + nudge) >> shift ) +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsrldq ymm15, ymm14, 4 // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0 + vpmuldq ymm14, ymm14, ymm8 // ymm14 <- a0*c, a2*c, a4*c, a6*c + vpmuldq ymm15, ymm15, ymm8 // ymm15 <- a1*c, a3*c, a5*c, a7*c + + vpaddq ymm14, ymm14, ymm11 + vpaddq ymm15, ymm15, ymm11 + + vpsubq ymm14, ymm14, ymm9 + vpsubq ymm15, ymm15, ymm9 + + vpsrlq ymm14, ymm14, xmm10 + vpsrlq ymm15, ymm15, xmm10 + + vpslldq ymm15, ymm15, 4 + vpblendd ymm14, ymm15, ymm14, 85 // 0x55 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}q_scale_rounding_away: // 
signum * ( (abs + nudge) >> shift ) +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsrldq ymm15, ymm14, 4 // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0 + vpmuldq ymm14, ymm14, ymm8 // ymm14 <- a0*c, a2*c, a4*c, a6*c + vpmuldq ymm15, ymm15, ymm8 // ymm15 <- a1*c, a3*c, a5*c, a7*c + + vpaddq ymm14, ymm14, ymm11 + vpaddq ymm15, ymm15, ymm11 + + vpsrlq ymm14, ymm14, xmm10 + vpsrlq ymm15, ymm15, xmm10 + + vpslldq ymm15, ymm15, 4 + vpblendd ymm14, ymm15, ymm14, 85 // 0x55 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}q_scale_rounding_minus_inf: // signum * ( (abs << 32 + 1<<30+shift) >> shift ) +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + // sign extract for nudging in the right direction + vpxor ymm13, ymm13, ymm13 + vpcmpgtd ymm13, ymm{{i}}, ymm13 // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros) + vpsrld ymm13, ymm13, 31 // then just 0 or 1 + + vpsrldq ymm15, ymm14, 4 // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0 + vpmuldq ymm14, ymm14, ymm8 // ymm14 <- a0*c, a2*c, a4*c, a6*c + vpmuldq ymm15, ymm15, ymm8 // ymm15 <- a1*c, a3*c, a5*c, a7*c + + vpaddq ymm14, ymm14, ymm11 + vpaddq ymm15, ymm15, ymm11 + + // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64 + vpxor ymm12, ymm12, ymm12 + vpblendd ymm12, ymm12, ymm13, 85 // 0x55 + vpsubq ymm14, ymm14, ymm12 + + vpsrldq ymm13, ymm13, 4 // ymm13 <- s1, s2, .., s7, 0 + vpxor ymm12, ymm12, ymm12 + vpblendd ymm12, ymm12, ymm13, 85 // 0x55 + vpsubq ymm15, ymm15, ymm12 + + vpsrlq ymm14, ymm14, xmm10 + vpsrlq ymm15, ymm15, xmm10 + + vpslldq ymm15, ymm15, 4 + vpblendd ymm14, ymm15, ymm14, 85 // 0x55 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}q_scale_rounding_plus_inf: // signum * ( (abs << 32 + 1<<30+shift) >> shift ) + + vpbroadcastd ymm9, xmm9 + +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpxor ymm13, ymm13, ymm13 + + // sign extract for nudging in the right direction + vpcmpgtd ymm13, ymm{{i}}, 
ymm13 // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros) + vpaddd ymm13, ymm13, ymm9 // if val >= 0 { 0i32 } else { 1i32 } + + vpsrldq ymm15, ymm14, 4 // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0 + vpmuldq ymm14, ymm14, ymm8 // ymm14 <- a0*c, a2*c, a4*c, a6*c + vpmuldq ymm15, ymm15, ymm8 // ymm15 <- a1*c, a3*c, a5*c, a7*c + + vpaddq ymm14, ymm14, ymm11 + vpaddq ymm15, ymm15, ymm11 + + // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64 + vpxor ymm12, ymm12, ymm12 + vpblendd ymm12, ymm12, ymm13, 85 // 0x55 + vpsubq ymm14, ymm14, ymm12 + + vpsrldq ymm13, ymm13, 4 // ymm13 <- s1, s2, .., s7, 0 + vpxor ymm12, ymm12, ymm12 + vpblendd ymm12, ymm12, ymm13, 85 // 0x55 + vpsubq ymm15, ymm15, ymm12 + + vpsrlq ymm14, ymm14, xmm10 + vpsrlq ymm15, ymm15, xmm10 + + vpslldq ymm15, ymm15, 4 + vpblendd ymm14, ymm15, ymm14, 85 // 0x55 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}q_scale_rounding_even: // signum * ( (abs + nudge) >> shift ) +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsrldq ymm15, ymm14, 4 // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0 + vpmuldq ymm14, ymm14, ymm8 // ymm14 <- a0*c, a2*c, a4*c, a6*c + vpmuldq ymm15, ymm15, ymm8 // ymm15 <- a1*c, a3*c, a5*c, a7*c + + vpsrlq ymm12, ymm14, xmm10 + vpand ymm12, ymm12, ymm9 + vpaddq ymm14, ymm14, ymm12 + vpsubq ymm14, ymm14, ymm9 + + vpsrlq ymm12, ymm15, xmm10 + vpand ymm12, ymm12, ymm9 + vpaddq ymm15, ymm15, ymm12 + vpsubq ymm15, ymm15, ymm9 + + vpaddq ymm14, ymm14, ymm11 + vpaddq ymm15, ymm15, ymm11 + + vpsrlq ymm14, ymm14, xmm10 + vpsrlq ymm15, ymm15, xmm10 + + vpslldq ymm15, ymm15, 4 + vpblendd ymm14, ymm15, ymm14, 85 // 0x55 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_scale_rounding_odd: // signum * ( (abs + nudge) >> shift ) +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsrldq ymm15, ymm14, 4 // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0 + vpmuldq ymm14, ymm14, ymm8 // ymm14 <- a0*c, a2*c, 
a4*c, a6*c + vpmuldq ymm15, ymm15, ymm8 // ymm15 <- a1*c, a3*c, a5*c, a7*c + + vpsrlq ymm12, ymm14, xmm10 + vpand ymm12, ymm12, ymm9 + vpsubq ymm14, ymm14, ymm12 + + vpsrlq ymm12, ymm15, xmm10 + vpand ymm12, ymm12, ymm9 + vpsubq ymm15, ymm15, ymm12 + + vpaddq ymm14, ymm14, ymm11 + vpaddq ymm15, ymm15, ymm11 + + vpsrlq ymm14, ymm14, xmm10 + vpsrlq ymm15, ymm15, xmm10 + + vpslldq ymm15, ymm15, 4 + vpblendd ymm14, ymm15, ymm14, 85 // 0x55 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}q_shl: + mov eax, [ rdi + 8 ] // xmm10 <- -shift (8 times) + movd xmm10, eax + vpbroadcastd ymm10, xmm10 + +{% for i in (0..7) %} + vpsllvd ymm{{i}}, ymm{{i}}, ymm10 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_shr: + mov r8, [ rdi + 16 ] // policy + + mov eax, 1 + movd xmm9, eax + vpbroadcastd ymm9, xmm9 // ymm9 <- 1u32 (8 times) + + mov eax, [ rdi + 8 ] // xmm10 <- shift (8 times) + movd xmm10, eax + vpbroadcastd ymm10, xmm10 + + mov ebx, 1 + mov cl, al + sub cl, 1 // rcx <- shift -1 + sal ebx, cl // rbx <- (1 << (shift - 1)) + movd xmm11, ebx + vpbroadcastd ymm11, xmm11 // ymm11 <- "half" + + vpxor ymm12, ymm12, ymm12 // ymm12 <- zeroes + + cmp r8, 1 + je {{L}}q_shr_rounding_zero + cmp r8, 2 + je {{L}}q_shr_rounding_away + cmp r8, 3 + je {{L}}q_shr_rounding_minus_inf + cmp r8, 4 + je {{L}}q_shr_rounding_plus_inf + cmp r8, 5 + je {{L}}q_shr_rounding_even + cmp r8, 6 + je {{L}}q_shr_rounding_odd + + jmp {{L}}unsupported + +{{L}}q_shr_rounding_zero: +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsubd ymm14, ymm14, ymm9 + vpaddd ymm14, ymm14, ymm11 + vpsravd ymm14, ymm14, ymm10 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_shr_rounding_away: +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpaddd ymm14, ymm14, ymm11 + vpsravd ymm14, ymm14, ymm10 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_shr_rounding_minus_inf: +{% for i in (0..7) %} + vpsubd ymm{{i}}, 
ymm{{i}}, ymm9 + vpaddd ymm{{i}}, ymm{{i}}, ymm11 + vpsravd ymm{{i}}, ymm{{i}}, ymm10 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_shr_rounding_plus_inf: +{% for i in (0..7) %} + vpaddd ymm{{i}}, ymm{{i}}, ymm11 + vpsravd ymm{{i}}, ymm{{i}}, ymm10 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_shr_rounding_even: +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsravd ymm13, ymm14, ymm10 + vpand ymm13, ymm13, ymm9 + vpsubd ymm13, ymm13, ymm9 // nudge = ((abs >>l shift) & 0x01) - 1 + vpaddd ymm14, ymm14, ymm13 // add nudge + vpaddd ymm14, ymm14, ymm11 // add half + vpsravd ymm14, ymm14, ymm10 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}q_shr_rounding_odd: +{% for i in (0..7) %} + vpabsd ymm14, ymm{{i}} + vpsravd ymm13, ymm14, ymm10 + vpand ymm13, ymm13, ymm9 + vpsubd ymm13, ymm12, ymm13 // nudge = - ((abs >>l shift) & 0x01) + vpaddd ymm14, ymm14, ymm13 // add nudge + vpaddd ymm14, ymm14, ymm11 // add half + vpsravd ymm14, ymm14, ymm10 + vpsignd ymm{{i}}, ymm14, ymm{{i}} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rdx, [rdi + 24] // col stride + mov rcx, [rdi + 32] // item size + + cmp rcx, 4 + je {{L}}store_strides_i32 + + {% for col in (0..7) %} + mov r10, r8 + {% for row in (0..3) %} + extractps ebx, xmm{{col}}, {{row}} + mov byte ptr [r10], bl + add r10, rsi + {% endfor %} + vperm2f128 ymm{{col}}, ymm{{col}}, ymm{{col}}, 1 + {% for row in (0..3) %} + extractps ebx, xmm{{col}}, {{row}} + mov byte ptr [r10], bl + add r10, rsi + {% endfor %} + add r8, rdx + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_strides_i32: + {% for col in (0..7) %} + mov r10, r8 + {% for row in (0..3) %} + extractps ebx, xmm{{col}}, {{row}} + mov dword ptr [r10], ebx + add r10, rsi + {% endfor %} + vperm2f128 ymm{{col}}, ymm{{col}}, ymm{{col}}, 1 + {% for row in (0..3) %} + extractps ebx, xmm{{col}}, {{row}} + mov dword ptr [r10], ebx + add r10, 
rsi + {% endfor %} + add r8, rdx + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}return: + ldmxcsr [rsp + 4] + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + + +{{L}}one_32bit: +{% if msvc %} + dd 1 +{% else %} + .int 1 +{% endif %} + +{% if msvc %} +avx2_mmm_i32_8x8_{{suffix}} endp +_text ends +end +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq new file mode 100644 index 000000000..1c63f72ad --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq @@ -0,0 +1,40 @@ +// vim: set syntax=asm : + +{{L}}non_linear: + +{{L}}non_linear_loop_enter: + sub rdi, 40 +{{L}}non_linear_loop: + add rdi, 40 + mov rax, [rdi] + + mov r8, {{ jump_table | size }} + cmp rax, 0 + cmovl rax, r8 + cmp rax, {{ jump_table | size }} + cmovg rax, r8 + +{% if msvc %} + lea r8, [ offset {{L}}jmp_table ] +{% else %} + lea r8, [ rip + {{L}}jmp_table ] +{% endif %} + movsxd r9, dword ptr [ r8 + rax * 4 ] + lea r8, [ r8 + r9 ] + jmp r8 + +{{L}}jmp_table: +{% for j in jump_table %} + {{long}} {{L}}{{j}}-{{L}}jmp_table +{% endfor %} + {{long}} {{L}}unsupported-{{L}}jmp_table + +{{L}}unsupported: + mov rax, 1 + jmp {{L}}return + + +{{L}}done: + mov rax, 0 + jmp {{L}}return + diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl new file mode 100644 index 000000000..8c790e11a --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl @@ -0,0 +1,143 @@ +{% comment %} +// vim: set 
syntax=asm : +/* mmm 16 x 5: + + ymm0 ymm2 ymm4 ymm6 ymm8 + ymm1 ymm3 ymm5 ymm7 ymm9 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"16x5", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "2x5/packed_packed_loop1/avx.tmpli" %} + + add rcx, 20 + add rax, 64 + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +// NON LINEAR / ADDC + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:9, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:16, from:0, to:9, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:16, from:0, to:9, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:9 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + + lea r8, [ r10 + rsi * 8 ] + +{% for i in (0..4) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r10 + ymm14 ], ymm15 + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm13, [ r8 + ymm14 ], ymm15 + add r10, rbx + add r8, rbx + vaddps ymm{{i | times:2 }}, ymm{{i | times:2}}, ymm12 + vaddps ymm{{i | times:2 | plus: 1}}, ymm{{i | times:2 | plus:1 }}, 
ymm13 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups ymm12, [rax] + vmovups ymm13, [rax + 32] + +{% for i in (0..4) %} + vbroadcastss ymm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps ymm{{i|times:2}}, ymm12, ymm14 + vfmadd231ps ymm{{i|times:2|plus:1}}, ymm13, ymm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r12, [ r8 + 4 * rbx ] + lea r11, [ r10 + rbx ] + cmp rbx, 64 + jne {{L}}store_strides_generic + + {% for row in (0..1) %} + {% for col in (0..4) %} + vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:2|plus:row}} + add r{{col|plus:8}}, 32 + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_strides_generic: + // tops of cols + + {% for quarter in (0..3) %} + {% if quarter != 0 %} + // move next four rows at top (xmm0,2,..10) + vperm2f128 ymm0, ymm0, ymm1, {{quarter}} + vperm2f128 ymm2, ymm2, ymm3, {{quarter}} + vperm2f128 ymm4, ymm4, ymm5, {{quarter}} + vperm2f128 ymm6, ymm6, ymm7, {{quarter}} + vperm2f128 ymm8, ymm8, ymm9, {{quarter}} + {% endif %} + {% for row in (0..3) %} + {% for i in (0..4) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | times:2}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" type:"f32", size:"16x5", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl new file mode 100644 index 000000000..1dae2fde1 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl @@ -0,0 +1,131 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 16 x 6: + + ymm0 ymm2 ymm4 ymm6 ymm8 ymm10 + ymm1 ymm3 ymm5 ymm7 ymm9 ymm11 + +System V ABI: + args: rdi, 
rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"16x6", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "2x6/packed_packed_loop1/original.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +// NON LINEAR / ADDC + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:11, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:16, from:0, to:11, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:16, from:0, to:11, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + + lea r8, [ r10 + rsi * 8 ] + +{% for i in (0..5) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r10 + ymm14 ], ymm15 + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm13, [ r8 + ymm14 ], ymm15 + add r10, rbx + add r8, rbx + vaddps ymm{{i | times:2 }}, ymm{{i | times:2}}, ymm12 + vaddps ymm{{i | times:2 | plus: 1}}, ymm{{i | times:2 | plus:1 }}, ymm13 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups 
ymm12, [rax] + vmovups ymm13, [rax + 32] + +{% for i in (0..5) %} + vbroadcastss ymm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps ymm{{i|times:2}}, ymm12, ymm14 + vfmadd231ps ymm{{i|times:2|plus:1}}, ymm13, ymm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r12, [ r8 + 4 * rbx ] + lea r11, [ r10 + rbx ] + lea r13, [ r12 + rbx ] + + {% for quarter in (0..3) %} + {% if quarter != 0 %} + // move next four rows at top (xmm0,2,..10) + vperm2f128 ymm0, ymm0, ymm1, {{quarter}} + vperm2f128 ymm2, ymm2, ymm3, {{quarter}} + vperm2f128 ymm4, ymm4, ymm5, {{quarter}} + vperm2f128 ymm6, ymm6, ymm7, {{quarter}} + vperm2f128 ymm8, ymm8, ymm9, {{quarter}} + vperm2f128 ymm10, ymm10, ymm11, {{quarter}} + {% endif %} + {% for row in (0..3) %} + {% for i in (0..5) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | times:2}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" type:"f32", size:"16x6", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl new file mode 100644 index 000000000..47b6e24ec --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl @@ -0,0 +1,158 @@ +{% comment %} +// vim: set syntax=asm : +/* mmm 24 x 4: + + ymm0 ymm3 ymm6 ymm9 + ymm1 ymm4 ymm7 ymm10 + ymm2 ymm5 ymm8 ymm11 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 +
return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"24x4", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "3x4/packed_packed_loop1/avx.tmpli" %} + + add rcx, 16 + add rax, 96 + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +// NON LINEAR / ADDC + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:11, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:24, from:0, to:11, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:24, from:0, to:11, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + cmp rsi, 4 + jne {{L}}unicast_generic + + lea r9, [ r8 + rbx ] + lea r10, [ r9 + rbx] + lea r11, [ r10 + rbx ] + lea r12, [ r11 + rbx ] + +{% for col in (0..3) %} + {% for row in (0..2) %} + vmovups ymm12, [ r{{col|plus:8}} ] + add r{{col|plus:8}}, 32 + vaddps ymm{{col|times:3|plus:row}}, ymm{{col|times:3|plus:row}}, ymm12 + {% endfor %} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}unicast_generic: + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + +// mov r12, [0] + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + + lea r9, [ r8 + rsi * 8 ] + lea r10, [ r9 + rsi * 8 ] + +{% for col in (0..3) %} + {% for row in (0..2) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r{{row|plus:8}} + ymm14 ], ymm15 + add r{{row|plus:8}}, rbx + vaddps ymm{{col|times:3|plus:row}}, ymm{{col|times:3|plus:row}}, ymm12 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ 
rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups ymm12, [rax] + vmovups ymm13, [rax + 32] + vmovups ymm15, [rax + 64] +{% for i in (0..3) %} + vbroadcastss ymm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps ymm{{i|times:3}}, ymm12, ymm14 + vfmadd231ps ymm{{i|times:3|plus:1}}, ymm13, ymm14 + vfmadd231ps ymm{{i|times:3|plus:2}}, ymm15, ymm14 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + + cmp rsi, 4 + jne {{L}}store_strides_generic + + {% for col in (0..3) %} + {% for row in (0..2) %} + vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:3|plus:row}} + add r{{col|plus:8}}, 32 + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_strides_generic: + {% for col in (0..3) %} + {% for row in (0..2) %} + {% for i in (0..3) %} + vextractps dword ptr [r{{col | plus: 8}}], xmm{{col | times:3 | plus:row}}, {{i}} + add r{{col | plus: 8}}, rsi + {% endfor %} + vperm2f128 ymm{{col | times:3 | plus:row}}, ymm{{col | times:3 | plus:row}}, ymm{{col | times:3 | plus:row}}, 1 + {% for i in (0..3) %} + vextractps dword ptr [r{{col | plus: 8}}], xmm{{col | times:3|plus:row}}, {{i}} + add r{{col | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" type:"f32", size:"24x4", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl new file mode 100644 index 000000000..e4c89bd59 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl @@ -0,0 +1,368 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 32 x 1 + + ymm0 + ymm1 + ymm2 + ymm3 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 +
return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"32x1", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + mov r8, [rdi + 32] // packing + test rbx, rbx + jz {{L}}non_linear_loop + + cmp r8, 1 + jz {{L}}q40f32 + + cmp r8, 2 + jz {{L}}q40f16 + + cmp r8, 3 + jz {{L}}f16f16 + +{{align}} 16 +{{L}}main_loop_packed_packed: + vbroadcastss ymm15, dword ptr [rcx] + + vmovaps ymm8, [rax] + vmovaps ymm9, [rax + 32] + vmovaps ymm10, [rax + 64] + vmovaps ymm11, [rax + 96] + + vfmadd231ps ymm0, ymm15, ymm8 + vfmadd231ps ymm1, ymm15, ymm9 + vfmadd231ps ymm2, ymm15, ymm10 + vfmadd231ps ymm3, ymm15, ymm11 + + add rcx, 4 + add rax, 128 + sub rbx, 1 + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% if msvc %} +{{L}}q40f32_mask: + {{long}} 0F0F0F0Fh +{{L}}q40f32_eight: + {{long}} 08h +{% else %} +{{L}}q40f32_mask: + {{long}} 0x0F0F0F0F +{{L}}q40f32_eight: + {{long}} 8 +{% endif %} + +{{L}}q40f32: + // ymm0-3: acc + // ymm4-7: scales + // ymm13: 8 + // ymm14: mask + // ymm15: b value + vbroadcastss ymm14, dword ptr [{{offset}} {{L}}q40f32_mask] + vbroadcastss ymm13, dword ptr [{{offset}} {{L}}q40f32_eight] + +{{L}}q40f32_outerloop: + // scales + vmovaps xmm4, [rax] + vmovaps xmm5, [rax + 16] + vmovaps xmm6, [rax + 32] + vmovaps xmm7, [rax + 48] + vcvtph2ps ymm4, xmm4 + vcvtph2ps ymm5, xmm5 + vcvtph2ps ymm6, xmm6 + vcvtph2ps ymm7, xmm7 + add rax, 64 + + mov rdx, 32 + +{{L}}q40f32_innerloop: + vbroadcastss ymm15, dword ptr [rcx] + vmovaps xmm8, [rax] // 32 nibbles + + vpand xmm10, xmm8, xmm14 // 16 bytes + + vpmovzxbd ymm9, xmm10 // 8 u32 + + vpermilpd xmm10, xmm10, 1 // swap 
64bit halves + vpmovzxbd ymm10, xmm10 // 8 u32 + + vpsrlw xmm8, xmm8, 4 + vpand xmm12, xmm8, xmm14 // 16 bytes + vpmovzxbd ymm11, xmm12 // 8 u32 + vpermilpd xmm12, xmm12, 1 // swap 64bit halves + vpmovzxbd ymm12, xmm12 // 8 u32 + + vpsubd ymm9, ymm9, ymm13 + vpsubd ymm10, ymm10, ymm13 + vpsubd ymm11, ymm11, ymm13 + vpsubd ymm12, ymm12, ymm13 + + vcvtdq2ps ymm9, ymm9 + vcvtdq2ps ymm10, ymm10 + vcvtdq2ps ymm11, ymm11 + vcvtdq2ps ymm12, ymm12 + + vmulps ymm9, ymm9, ymm4 + vmulps ymm10, ymm10, ymm5 + vmulps ymm11, ymm11, ymm6 + vmulps ymm12, ymm12, ymm7 + + vfmadd231ps ymm0, ymm15, ymm9 + vfmadd231ps ymm1, ymm15, ymm10 + vfmadd231ps ymm2, ymm15, ymm11 + vfmadd231ps ymm3, ymm15, ymm12 + + add rax, 16 + add rcx, 4 + sub rdx, 1 + jnz {{L}}q40f32_innerloop + + sub rbx, 32 + jnz {{L}}q40f32_outerloop + + jmp {{L}}non_linear_loop + +{{L}}q40f16: + // ymm0-3: acc + // ymm4-7: scales + // ymm13: 8 + // ymm14: mask + // ymm15: b value + vbroadcastss ymm14, dword ptr [{{offset}} {{L}}q40f32_mask] + vbroadcastss ymm13, dword ptr [{{offset}} {{L}}q40f32_eight] + +{{L}}q40f16_outerloop: + // scales + vmovaps xmm4, [rax] + vmovaps xmm5, [rax + 16] + vmovaps xmm6, [rax + 32] + vmovaps xmm7, [rax + 48] + vcvtph2ps ymm4, xmm4 + vcvtph2ps ymm5, xmm5 + vcvtph2ps ymm6, xmm6 + vcvtph2ps ymm7, xmm7 + add rax, 64 + + mov rdx, 32 + +{{L}}q40f16_innerloop: + vpbroadcastw ymm15, word ptr [rcx] + vcvtph2ps ymm15, xmm15 + + vmovaps xmm8, [rax] // 32 nibbles + + vpand xmm10, xmm8, xmm14 // 16 bytes + + vpmovzxbd ymm9, xmm10 // 8 u32 + + vpermilpd xmm10, xmm10, 1 // swap 64bit halves + vpmovzxbd ymm10, xmm10 // 8 u32 + + vpsrlw xmm8, xmm8, 4 + vpand xmm12, xmm8, xmm14 // 16 bytes + vpmovzxbd ymm11, xmm12 // 8 u32 + vpermilpd xmm12, xmm12, 1 // swap 64bit halves + vpmovzxbd ymm12, xmm12 // 8 u32 + + vpsubd ymm9, ymm9, ymm13 + vpsubd ymm10, ymm10, ymm13 + vpsubd ymm11, ymm11, ymm13 + vpsubd ymm12, ymm12, ymm13 + + vcvtdq2ps ymm9, ymm9 + vcvtdq2ps ymm10, ymm10 + vcvtdq2ps ymm11, ymm11 + vcvtdq2ps 
ymm12, ymm12 + + vmulps ymm9, ymm9, ymm4 + vmulps ymm10, ymm10, ymm5 + vmulps ymm11, ymm11, ymm6 + vmulps ymm12, ymm12, ymm7 + + vfmadd231ps ymm0, ymm15, ymm9 + vfmadd231ps ymm1, ymm15, ymm10 + vfmadd231ps ymm2, ymm15, ymm11 + vfmadd231ps ymm3, ymm15, ymm12 + + add rax, 16 + add rcx, 2 + sub rdx, 1 + jnz {{L}}q40f16_innerloop + + sub rbx, 32 + jnz {{L}}q40f16_outerloop + + jmp {{L}}non_linear_loop + +{{L}}f16f16: +{{align}} 16 + vpbroadcastw ymm15, word ptr [rcx] + + vmovaps xmm4, [rax] + vmovaps xmm5, [rax + 16] + vmovaps xmm6, [rax + 32] + vmovaps xmm7, [rax + 48] + + vcvtph2ps ymm15, xmm15 + vcvtph2ps ymm4, xmm4 + vcvtph2ps ymm5, xmm5 + vcvtph2ps ymm6, xmm6 + vcvtph2ps ymm7, xmm7 + + vfmadd231ps ymm0, ymm15, ymm4 + vfmadd231ps ymm1, ymm15, ymm5 + vfmadd231ps ymm2, ymm15, ymm6 + vfmadd231ps ymm3, ymm15, ymm7 + + add rcx, 2 + add rax, 64 + sub rbx, 1 + jnz {{L}}f16f16 + + jmp {{L}}non_linear_loop + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:3, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:32, from:0, to:3, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:32, from:0, to:3, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:3 %} + +{{L}}add_unicast: + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + cmp rsi, 4 + jne {{L}}add_unicast_generic + + {% for row in (0..3) %} + vaddps ymm{{row}}, ymm{{row}}, [ r10 + {{row|times:32}} ] + {% endfor %} + jmp {{L}}non_linear_loop + + + jmp {{L}}non_linear_loop + +{{L}}add_unicast_generic: + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + +{% for i in (0..3) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r10 + ymm14 ], ymm15 + + vaddps ymm{{i}}, ymm{{i}}, ymm12 + lea r10, [ r10 + rsi * 8 ] +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ 
rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss ymm14, dword ptr [rbx] + +{% for i in (0..3) %} + vmovups ymm12, [rax + {{i|times:32}}] + vfmadd231ps ymm{{i}}, ymm12, ymm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov r11, [rdi + 32] // item size + + cmp r11, 2 + je {{L}}store_f16 + + cmp rsi, 4 + jne {{L}}store_generic + + {% for row in (0..3) %} + vmovups [r8 + {{row|times:32}}], ymm{{row}} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_generic: + + {% for vec in (0..3) %} + {% for half in (0..1) %} + {% if half == 0 %} + movaps xmm9, xmm{{vec}} + {% else %} + vperm2f128 ymm9, ymm{{vec}}, ymm{{vec}}, 1 + {% endif %} + {% for row in (0..3) %} + vextractps dword ptr [r8], xmm9, {{row}} + add r8, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_f16: + + vcvtps2ph xmm0, ymm0, 0 + vcvtps2ph xmm1, ymm1, 0 + vcvtps2ph xmm2, ymm2, 0 + vcvtps2ph xmm3, ymm3, 0 + + cmp rsi, 2 + jne {{L}}store_generic_f16 + + {% for row in (0..3) %} + vmovups [r8 + {{row|times:16}}], xmm{{row}} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_generic_f16: + + {% for vec in (0..3) %} + {% for row in (0..7) %} + pextrw word ptr [r8], xmm{{vec}}, {{row}} + add r8, rsi + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" type:"f32", size:"32x1", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl new file mode 100644 index 000000000..0675bc6b9 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl @@ -0,0 +1,239 @@ +{% comment %} +// vim: set syntax=asm : +/* mmm 32 x 3: + + ymm0 ymm4 ymm8 + ymm1 ymm5 ymm9 + ymm2 ymm6 ymm10 + ymm3 ymm7 ymm11 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8,
r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"32x3", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rbx, [rdi + 8] // k + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov r8, [rdi + 32] // packing + + test rbx, rbx + jz {{L}}non_linear_loop + + cmp r8, 1 + jz {{L}}main_loop_packed_packed_f32_f16 + +{{L}}main_loop_packed_packed: + {% include "4x3/packed_packed_loop1/avx.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{{L}}main_loop_packed_packed_f32_f16: + // Load col of A + vmovaps ymm12, [rax] + + // Fill 3 cols of B + vpbroadcastw xmm13, word ptr [rcx + 0] + vpbroadcastw xmm14, word ptr [rcx + 2] + vpbroadcastw xmm15, word ptr [rcx + 4] + + vcvtph2ps ymm13, xmm13 + vcvtph2ps ymm14, xmm14 + vcvtph2ps ymm15, xmm15 + + // N.B. 
Stepping cols in inner loop + vfmadd231ps ymm0, ymm12, ymm13 + vfmadd231ps ymm4, ymm12, ymm14 + vfmadd231ps ymm8, ymm12, ymm15 + + vmovaps ymm12, [rax+32] + + vfmadd231ps ymm1, ymm12, ymm13 + vfmadd231ps ymm5, ymm12, ymm14 + vfmadd231ps ymm9, ymm12, ymm15 + + vmovaps ymm12, [rax+64] + + vfmadd231ps ymm2, ymm12, ymm13 + vfmadd231ps ymm6, ymm12, ymm14 + vfmadd231ps ymm10, ymm12, ymm15 + + vmovaps ymm12, [rax+96] + + vfmadd231ps ymm3, ymm12, ymm13 + vfmadd231ps ymm7, ymm12, ymm14 + vfmadd231ps ymm11, ymm12, ymm15 + + add rcx, 6 + add rax, 128 + + dec rbx + jnz {{L}}main_loop_packed_packed_f32_f16 + + jmp {{L}}non_linear_loop + +// NON LINEAR / ADDC + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:11, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:32, from:0, to:11, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:32, from:0, to:11, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:11 %} + +{{L}}add_unicast: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + cmp rsi, 4 + jne {{L}}unicast_generic + + lea r9, [ r8 + rbx ] + lea r10, [ r9 + rbx] + lea r11, [ r10 + rbx ] + +{% for col in (0..2) %} + {% for row in (0..3) %} + vmovups ymm12, [ r{{col|plus:8}} ] + add r{{col|plus:8}}, 32 + vaddps ymm{{col|times:4|plus:row}}, ymm{{col|times:4|plus:row}}, ymm12 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}unicast_generic: + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + +// mov r12, [0] + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + + lea r9, [ r8 + rsi * 8 ] + lea r10, [ r9 + rsi * 8 ] + lea r11, [ r10 + rsi * 8 ] + +{% for col in (0..2) %} + {% for row in (0..3) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r{{row|plus:8}} + ymm14 ], ymm15 + add r{{row|plus:8}}, rbx + vaddps ymm{{col|times:4|plus:row}}, 
ymm{{col|times:4|plus:row}}, ymm12 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop + + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss ymm13, dword ptr [rbx] + vbroadcastss ymm14, dword ptr [rbx + 4] + vbroadcastss ymm15, dword ptr [rbx + 8] +{% for i in (0..3) %} + vmovups ymm12, [rax + {{i|times:32}}] + vfmadd231ps ymm{{0|plus:i}}, ymm12, ymm13 + vfmadd231ps ymm{{4|plus:i}}, ymm12, ymm14 + vfmadd231ps ymm{{8|plus:i}}, ymm12, ymm15 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + mov r11, [rdi + 32] // item size + + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + + cmp r11, 2 + je {{L}}store_f16 + + cmp rsi, 4 + jne {{L}}store_strides_generic + + {% for col in (0..2) %} + {% for row in (0..3) %} + vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:4|plus:row}} + add r{{col|plus:8}}, 32 + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_strides_generic: + + {% for col in (0..2) %} + {% for row in (0..3) %} + {% for i in (0..3) %} + vextractps dword ptr [r{{col | plus: 8}}], xmm{{col | times:4 | plus:row}}, {{i}} + add r{{col | plus: 8}}, rsi + {% endfor %} + vperm2f128 ymm{{col | times:4 | plus:row}}, ymm{{col | times:4 | plus:row}}, ymm{{col | times:4 | plus:row}}, 1 + {% for i in (0..3) %} + vextractps dword ptr [r{{col | plus: 8}}], xmm{{col | times:4|plus:row}}, {{i}} + add r{{col | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store_f16: + + {% for reg in (0..11) %} + vcvtps2ph xmm{{reg}}, ymm{{reg}}, 0 + {% endfor %} + + cmp rsi, 2 + jne {{L}}store_generic_f16 + + {% for col in (0..2) %} + {% for row in (0..3) %} + vmovups [r{{col|plus:8}} + {{row|times:16}}], xmm{{col|times:4|plus:row}} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_generic_f16: + {% for col in (0..2) %} + {% for vec in (0..3) %} 
+ {% for row in (0..7) %} + pextrw word ptr [r{{col|plus:8}}], xmm{{col|times:4|plus:vec}}, {{row}} + add r{{col|plus:8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" type:"f32", size:"32x3", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl new file mode 100644 index 000000000..81a47ef0c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl @@ -0,0 +1,158 @@ +{% comment %} +// vim: set syntax=asm : +/* mmm 40 x 2: + + ymm0 ymm5 + ymm1 ymm6 + ymm2 ymm7 + ymm3 ymm8 + ymm4 ymm9 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"40x2", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + {% include "5x2/packed_packed_loop1/avx.tmpli" %} + + dec rbx + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +// NON LINEAR / ADDC + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:9, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:40, from:0, to:9, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:40, from:0, to:9, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:9 %} + +{{L}}add_unicast: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + cmp rsi, 4 + jne
{{L}}unicast_generic + + lea r9, [ r8 + rbx ] + lea r10, [ r9 + rbx] + lea r11, [ r10 + rbx ] + lea r12, [ r11 + rbx ] + + +{% for col in (0..1) %} + {% for row in (0..4) %} + vmovups ymm12, [ r{{col|plus:8}} ] + add r{{col|plus:8}}, 32 + vaddps ymm{{col|times:5|plus:row}}, ymm{{col|times:5|plus:row}}, ymm12 + {% endfor %} +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}unicast_generic: + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + + lea r9, [ r8 + rsi * 8] + lea r10, [ r9 + rsi * 8] + lea r11, [ r10 + rsi * 8] + lea r12, [ r11 + rsi * 8] + +{% for col in (0..1) %} + {% for row in (0..4) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r{{row|plus:8}} + ymm14 ], ymm15 + add r{{row|plus:8}}, rbx + vaddps ymm{{col|times:5|plus:row}}, ymm{{col|times:5|plus:row}}, ymm12 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss ymm10, dword ptr [rbx] + vbroadcastss ymm11, dword ptr [rbx + 4] +{% for i in (0..4) %} + vmovups ymm12, [rax + {{i|times:32}}] + vfmadd231ps ymm{{0|plus:i}}, ymm12, ymm10 + vfmadd231ps ymm{{5|plus:i}}, ymm12, ymm11 +{% endfor %} + jmp {{L}}non_linear_loop + + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r11, [ r10 + rbx ] + lea r12, [ r10 + 2 * rbx ] + + cmp rsi, 4 + jne {{L}}store_strides_generic + + {% for col in (0..1) %} + {% for row in (0..4) %} + vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:5|plus:row}} + add r{{col|plus:8}}, 32 + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_strides_generic: + {% for col in (0..1) %} + {% for row in (0..4) %} + {% for i in (0..3) %} + vextractps dword ptr 
[r{{col | plus: 8}}], xmm{{col | times:5 | plus:row}}, {{i}} + add r{{col | plus: 8}}, rsi + {% endfor %} + vperm2f128 ymm{{col | times:5 | plus:row}}, ymm{{col | times:5 | plus:row}}, ymm{{col | times:5 | plus:row}}, 1 + {% for i in (0..3) %} + vextractps dword ptr [r{{col | plus: 8}}], xmm{{col | times:5|plus:row}}, {{i}} + add r{{col | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + jmp {{L}}non_linear_loop + +{% include "postamble.tmpliq" type:"f32", size:"40x2", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl new file mode 100644 index 000000000..55b7e59de --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl @@ -0,0 +1,142 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 64 x 1 + + ymm0 + ymm1 + ... + ymm7 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rbx, rbx + jz {{L}}non_linear_loop + + test rbx, 1 + jz {{L}}main_loop_packed_packed + {% include "8x1/packed_packed_loop1/avx.tmpli" %} + + dec rbx + jz {{L}}non_linear_loop + +{{align}} 16 +{{L}}main_loop_packed_packed: + {% include "8x1/packed_packed_loop1/avx-unroll.tmpli" %} + + sub rbx, 2 + jnz {{L}}main_loop_packed_packed + + jmp {{L}}non_linear_loop + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f32" %} +{% include
"fma_mmm_f32_per_rows.tmpliq" mr:64, from:0, to:7, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:64, from:0, to:7, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:7 %} + +{{L}}add_unicast: + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + cmp rsi, 4 + jne {{L}}add_unicast_generic + + {% for row in (0..7) %} + vaddps ymm{{row}}, ymm{{row}}, [ r10 + {{row|times:32}} ] + {% endfor %} + jmp {{L}}non_linear_loop + + + jmp {{L}}non_linear_loop + +{{L}}add_unicast_generic: + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + +{% for i in (0..7) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r10 + ymm14 ], ymm15 + + vaddps ymm{{i}}, ymm{{i}}, ymm12 + lea r10, [ r10 + rsi * 8 ] +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vbroadcastss ymm14, dword ptr [rbx] + +{% for i in (0..7) %} + vmovups ymm12, [rax + {{i|times:32}}] + vfmadd231ps ymm{{i}}, ymm12, ymm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + + cmp rsi, 4 + jne {{L}}store_generic + + {% for row in (0..7) %} + vmovups [r8 + {{row|times:32}}], ymm{{row}} + {% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}store_generic: + + {% for vec in (0..7) %} + {% for half in (0..1) %} + {% if half == 0 %} + movaps xmm9, xmm{{vec}} + {% else %} + vperm2f128 ymm9, ymm{{vec}}, ymm{{vec}}, 1 + {% endif %} + {% for row in (0..3) %} + vextractps dword ptr [r8], xmm9, {{row}} + add r8, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + + +{% include "postamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl 
b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl new file mode 100644 index 000000000..681866a78 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl @@ -0,0 +1,129 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm 8 x 8: + + ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 + (one ymm register per column, 8 rows each) + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + +{% include "preamble.tmpliq" type:"f32", size:"8x8", suffix:suffix, G:G %} + +{{L}}clear: + vzeroall + jmp {{L}}non_linear_loop + +{{L}}add_mat_mul: + mov rbx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rcx, [rdi + 8] // k + test rcx, rcx + jz {{L}}non_linear_loop + +{{L}}main_loop_packed_packed: + vmovaps ymm12, [rax] + + {% for i in (0..7) %} + vbroadcastss ymm14, dword ptr [rbx + {{i}} * 4] + vfmadd231ps ymm{{i}}, ymm12, ymm14 + {% endfor %} + + add rax, 32 + add rbx, 32 + dec rcx + jnz {{L}}main_loop_packed_packed + jmp {{L}}non_linear_loop + +// NON LINEAR / ADDC + +{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f32" %} +{% include "fma_mmm_f32_per_rows.tmpliq" mr:8, from:0, to:7, type:"f32" %} +{% include "fma_mmm_f32_per_cols.tmpliq" mr:8, from:0, to:7, type:"f32" %} +{% include "fma_mmm_load_tile.tmpliq" from:0, to:7 %} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + mov eax, 0 +{% for i in (0..3) %} + pinsrd xmm14, eax, {{i}} + add eax, esi +{% endfor %} +{% for i in (0..3) %} + pinsrd xmm15, eax, {{i}} + add eax, esi +{% endfor %} + + vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 + +{% for i in
(0..7) %} + vpcmpeqd ymm15, ymm15, ymm15 + vgatherdps ymm12, [ r10 + ymm14 ], ymm15 + add r10, rbx + vaddps ymm{{i}}, ymm{{i}}, ymm12 +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}add_row_col_products: + mov rax, [ rdi + 8 ] + mov rbx, [ rdi + 16 ] + + vmovups ymm12, [rax] + +{% for i in (0..7) %} + vbroadcastss ymm14, dword ptr [rbx + {{i|times:4}} ] + vfmadd231ps ymm{{i}}, ymm12, ymm14 +{% endfor %} + jmp {{L}}non_linear_loop + +{{L}}store: + mov r8, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // tops of cols + lea r9, [ r8 + rbx ] + lea r10, [ r8 + 2 * rbx ] + lea r12, [ r8 + 4 * rbx ] + lea r11, [ r10 + rbx ] + lea r13, [ r12 + rbx ] + lea r14, [ r12 + 2 * rbx ] + lea r15, [ r13 + 2 * rbx ] + + {% for quarter in (0..1) %} + {% if quarter != 0 %} + // move next four rows at top (xmm0,2,..10) + {% for r in (0..7) %} + vperm2f128 ymm{{r}}, ymm{{r}}, ymm{{r}}, {{quarter}} + {% endfor %} + {% endif %} + {% for row in (0..3) %} + {% for i in (0..7) %} + vextractps dword ptr [r{{i | plus: 8}}], xmm{{i}}, {{row}} + add r{{i | plus: 8}}, rsi + {% endfor %} + {% endfor %} + {% endfor %} + + jmp {{L}}non_linear_loop + + +{% include "postamble.tmpliq" type:"f32", size:"8x8", suffix:suffix, G:G, L:L %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq new file mode 100644 index 000000000..c1a2cd487 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq @@ -0,0 +1,9 @@ +// vim: set syntax=asm : + +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, from:from, to:to, type:type%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, from:from, to:to, type:type%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, from:from, to:to, type:type%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_mul", 
op:"vmulps", mr:mr, from:from, to:to, type:type%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", from:from, to:to, type:type %} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type:type%} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq new file mode 100644 index 000000000..9e7a2ddcd --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq @@ -0,0 +1,9 @@ +// vim: set syntax=asm : + +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, from:from, to:to, type: type%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, from:from, to:to, type: type%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, from:from, to:to, type: type%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, from:from, to:to, type: type%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", from:from, to:to, type: type%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type: type%} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq new file mode 100644 index 000000000..a0a4d47d3 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq @@ -0,0 +1,38 @@ +// vim: set syntax=asm : + +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to, type:type%} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to, type:type%} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to, type:type%} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_mul", 
op:"vmulps", from:from, to:to, type:type%} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to, type:type%} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type:type%} + +{{L}}leaky_relu: + // can only use ymm12 to ymm15 + // ymm15 <- alpha + {% if type == "f32" %} + vbroadcastss ymm15, dword ptr [rdi + 8] + {% else %} + pinsrw xmm15, word ptr [rdi + 8], 0 + vcvtph2ps ymm15, xmm15 + vbroadcastss ymm15, xmm15 + {% endif %} + + // ymm14 <- all zero + vpxor ymm14, ymm14, ymm14 + + {% for reg in (from..to) %} + // ymm12 <- alpha * x + vmulps ymm12, ymm{{reg}}, ymm15 + vcmpps ymm13, ymm14, ymm{{reg}}, 1 // 1 means LT + vblendvps ymm{{reg}}, ymm12, ymm{{reg}}, ymm13 + {% endfor %} + // the blend keeps x in lanes where 0 < x and takes alpha * x elsewhere + + jmp {{L}}non_linear_loop + +{{L}}q_scale: +{{L}}q_shl: +{{L}}q_shr: + jmp {{L}}unsupported + diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq new file mode 100644 index 000000000..387b37920 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq @@ -0,0 +1,9 @@ +// vim: set syntax=asm : + +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_min", op:"vpminsd", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_max", op:"vpmaxsd", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_add", op:"vpaddd", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_mul", op:"vpmulld", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub", op:"vpsubd", from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32"%} + diff --git
a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq new file mode 100644 index 000000000..2b07a15e0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq @@ -0,0 +1,9 @@ +// vim: set syntax=asm : + +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_min", op:"vpminsd", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_max", op:"vpmaxsd", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_add", op:"vpaddd", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_mul", op:"vpmulld", mr:mr, from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub", op:"vpsubd", from:from, to:to, type:"i32"%} +{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32"%} + diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq new file mode 100644 index 000000000..b522b6948 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq @@ -0,0 +1,23 @@ +// vim: set syntax=asm : + +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_min", op:"vpminsd", from:from, to:to, type:"i32" %} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_max", op:"vpmaxsd", from:from, to:to, type:"i32" %} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_mul", op:"vpmulld", from:from, to:to, type:"i32" %} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_add", op:"vpaddd", from:from, to:to, type:"i32" %} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub", op:"vpsubd", from:from, to:to, type:"i32" %} +{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32" %} + 
+{{L}}leaky_relu: + // can only use ymm12 to ymm15 + // ymm15 <- alpha + vbroadcastss ymm15, dword ptr [rdi + 8] + // ymm14 <- all zero + vpxor ymm14, ymm14, ymm14 + + {% for reg in (from..to) %} + vpmulld ymm12, ymm{{reg}}, ymm15 + vpcmpgtd ymm13, ymm14, ymm{{reg}} + vblendvps ymm{{reg}}, ymm{{reg}}, ymm12, ymm13 + {% endfor %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq new file mode 100644 index 000000000..f0d1896b6 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq @@ -0,0 +1,9 @@ +// vim: set syntax=asm : + +{{L}}load_tile: + mov r8, [rdi + 8] + {% for reg in (from..to) %} + vmovups ymm{{reg}}, ymmword ptr [r8 + {{ reg|minus:from|times:32 }}] + {% endfor %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq new file mode 100644 index 000000000..95f72f65c --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq @@ -0,0 +1,35 @@ +// vim: set syntax=asm : + +{{L}}{{label}}: + mov rax, [ rdi + 8 ] + +{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%} +{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1}}{%endcapture%} + +{%capture tmp%}{{to | plus: 1 }}{%endcapture%} + +{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_8}}{%endcapture%} +{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_8|minus:1}}{%endcapture%} + + +{% for right in (0..cols_min_1) %} + {% if type == "f16" %} + pinsrw xmm{{tmp}}, word ptr [ rax ], 0 + add rax, 2 + vcvtph2ps ymm{{tmp}}, xmm{{tmp}} + vbroadcastss ymm{{tmp}}, xmm{{tmp}} + {% else %} + vbroadcastss ymm{{tmp}}, dword ptr [ rax ] + add rax, 4 + {% endif %} + {% for down in (0..mr_over_8_min_1) %} + {%capture acc%}{{mr_over_8|times:right|plus:from|plus:down}}{%endcapture%} + {% if 
flipped %} + {{op}} ymm{{acc}}, ymm{{acc}}, ymm{{tmp}} + {% else %} + {{op}} ymm{{acc}}, ymm{{tmp}}, ymm{{acc}} + {% endif %} + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq new file mode 100644 index 000000000..7366a8ba0 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq @@ -0,0 +1,32 @@ +// vim: set syntax=asm : + +{{L}}{{label}}: + mov rax, [ rdi + 8 ] + +{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%} +{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1}}{%endcapture%} + +{% if type == "f16" %} + {% for ix in (0..mr_over_8_min_1) %} + vmovups xmm{{to | plus: 1 | plus: ix}}, [rax + {{ix | times: 16}}] + {% endfor %} + {% for ix in (0..mr_over_8_min_1) %} + vcvtph2ps ymm{{to | plus: 1 | plus: ix}}, xmm{{to | plus: 1 | plus: ix}} + {% endfor %} +{% else %} + {% for ix in (0..mr_over_8_min_1) %} + vmovups ymm{{to | plus: 1 | plus: ix}}, [rax + {{ix | times: 32}}] + {% endfor %} +{% endif %} + +{% if flipped %} + {% for acc in (from..to) %} + {{op}} ymm{{acc}}, ymm{{acc}}, ymm{{ acc | modulo: mr_over_8 | plus: to | plus: 1 }} + {% endfor %} +{% else %} + {% for acc in (from..to) %} + {{op}} ymm{{acc}}, ymm{{ acc | modulo: mr_over_8 | plus: to | plus: 1 }}, ymm{{acc}} + {% endfor %} +{% endif %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq new file mode 100644 index 000000000..5ac174965 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq @@ -0,0 +1,22 @@ +// vim: set syntax=asm : + +{{L}}{{label}}: + {% if type == "f16" %} + pinsrw xmm12, word ptr [rdi + 8], 0 + vcvtph2ps ymm12, xmm12 + vbroadcastss ymm12, xmm12 + {% else %} + vbroadcastss ymm12, dword ptr [rdi + 8] + {% endif %} + + {% if flipped %} + 
{% for reg in (from..to) %} + {{op}} ymm{{reg}}, ymm{{reg}}, ymm12 + {% endfor %} + {% else %} + {% for reg in (from..to) %} + {{op}} ymm{{reg}}, ymm12, ymm{{reg}} + {% endfor %} + {% endif %} + + jmp {{L}}non_linear_loop diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl new file mode 100644 index 000000000..4f650dc10 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl @@ -0,0 +1,319 @@ +{% comment %} +// vim: set syntax=asm : + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) + +{% endcomment %} + +{% if msvc %} + +_text segment +fma_sigmoid_f32_{{suffix}} proc + +{% else %} + +.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}fma_sigmoid_f32_{{suffix}} +{{G}}fma_sigmoid_f32_{{suffix}}: +.cfi_startproc +{% endif %} + + push rbp + mov rbp, rsp + + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps [rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + // move around arguments to mimick SysV rdi,rsi passing + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + +{% endif %} + + push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +// FIXME +// .cfi_def_cfa_offset 64 +{% endif %} + + stmxcsr 
[rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] +// ---------------------------------------------------------------------- + + cmp rsi, 0 + je {{L}}done + + cmp rsi, 32 + jl {{L}}loop_1 + +{{L}}loop_4: + + vmovaps ymm4, [rdi] + vmovaps ymm5, [rdi + 32] + vmovaps ymm6, [rdi + 64] + vmovaps ymm7, [rdi + 96] + + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13] + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11] + + vmaxps ymm4, ymm4, ymm0 + vmaxps ymm5, ymm5, ymm0 + vmaxps ymm6, ymm6, ymm0 + vmaxps ymm7, ymm7, ymm0 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + + vminps ymm4, ymm4, ymm1 + vminps ymm5, ymm5, ymm1 + vminps ymm6, ymm6, ymm1 + vminps ymm7, ymm7, ymm1 // ymm4..7 <- x + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmulps ymm8, ymm4, ymm4 + vmulps ymm9, ymm5, ymm5 + vmulps ymm10, ymm6, ymm6 + vmulps ymm11, ymm7, ymm7 // ymm8..11 <- x^2 + + vmovaps ymm12, ymm2 + vmovaps ymm13, ymm2 + vmovaps ymm14, ymm2 + vmovaps ymm15, ymm2 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm13, ymm3, ymm9 + vfmadd132ps ymm14, ymm3, ymm10 + vfmadd132ps ymm15, ymm3, ymm11 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + vfmadd132ps ymm12, ymm0, ymm8 + vfmadd132ps ymm13, ymm0, ymm9 + vfmadd132ps ymm14, ymm0, ymm10 + vfmadd132ps ymm15, ymm0, ymm11 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps ymm12, ymm1, ymm8 + vfmadd132ps ymm13, ymm1, ymm9 + vfmadd132ps ymm14, ymm1, ymm10 + vfmadd132ps ymm15, ymm1, ymm11 + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps ymm12, ymm2, ymm8 + vfmadd132ps ymm13, ymm2, ymm9 + vfmadd132ps ymm14, ymm2, ymm10 + vfmadd132ps 
ymm15, ymm2, ymm11 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm13, ymm3, ymm9 + vfmadd132ps ymm14, ymm3, ymm10 + vfmadd132ps ymm15, ymm3, ymm11 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps ymm12, ymm0, ymm8 + vfmadd132ps ymm13, ymm0, ymm9 + vfmadd132ps ymm14, ymm0, ymm10 + vfmadd132ps ymm15, ymm0, ymm11 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vmulps ymm4, ymm4, ymm12 + vmulps ymm5, ymm5, ymm13 + vmulps ymm6, ymm6, ymm14 + vmulps ymm7, ymm7, ymm15 // ymm4..7 <- num + + vmovaps ymm12, ymm1 + vmovaps ymm13, ymm1 + vmovaps ymm14, ymm1 + vmovaps ymm15, ymm1 + + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_half] + vfmadd132ps ymm12, ymm2, ymm8 + vfmadd132ps ymm13, ymm2, ymm9 + vfmadd132ps ymm14, ymm2, ymm10 + vfmadd132ps ymm15, ymm2, ymm11 + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm13, ymm3, ymm9 + vfmadd132ps ymm14, ymm3, ymm10 + vfmadd132ps ymm15, ymm3, ymm11 + vfmadd132ps ymm12, ymm0, ymm8 + vfmadd132ps ymm13, ymm0, ymm9 + vfmadd132ps ymm14, ymm0, ymm10 + vfmadd132ps ymm15, ymm0, ymm11 // ymm12..14 <- denum + + vdivps ymm4, ymm4, ymm12 + vdivps ymm5, ymm5, ymm13 + vdivps ymm6, ymm6, ymm14 + vdivps ymm7, ymm7, ymm15 + vaddps ymm4, ymm4, ymm1 + vaddps ymm5, ymm5, ymm1 + vaddps ymm6, ymm6, ymm1 + vaddps ymm7, ymm7, ymm1 + + vmovaps [rdi], ymm4 + vmovaps [rdi + 32], ymm5 + vmovaps [rdi + 64], ymm6 + vmovaps [rdi + 96], ymm7 + + add rdi, 128 + sub rsi, 32 + cmp rsi, 32 + jg {{L}}loop_4 + + cmp rsi, 0 + je {{L}}done + +{{L}}loop_1: + vmovaps ymm4, [rdi] + + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13] + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11] + + vmaxps ymm4, ymm4, ymm0 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + + vminps 
ymm4, ymm4, ymm1 // ymm4 <- x + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmulps ymm8, ymm4, ymm4 // ymm8 <- x^2 + + vmovaps ymm12, ymm2 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + vfmadd132ps ymm12, ymm3, ymm8 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + vfmadd132ps ymm12, ymm0, ymm8 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps ymm12, ymm1, ymm8 + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps ymm12, ymm2, ymm8 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vfmadd132ps ymm12, ymm3, ymm8 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps ymm12, ymm0, ymm8 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vmulps ymm4, ymm4, ymm12 + + vmovaps ymm12, ymm1 + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_half] + vfmadd132ps ymm12, ymm2, ymm8 + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm12, ymm0, ymm8 + + vdivps ymm4, ymm4, ymm12 + vaddps ymm4, ymm4, ymm1 + + vmovaps [rdi], ymm4 + add rdi, 32 + sub rsi, 8 + jnz {{L}}loop_1 +{{L}}done: + +// ---------------------------------------------------------------------- + + ldmxcsr [rsp + 4] + + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + +{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%} + +{{L}}coeffs_num_low: + {{float}} -18.6 // low +{{L}}coeffs_num_high: + {{float}} 18.6 // high + +{{L}}coeffs_num_alpha_13: + {{float}} -4.433153405e-18 +{{L}}coeffs_num_alpha_11: + {{float}} 
1.169974371e-14 +{{L}}coeffs_num_alpha_9: + {{float}} -1.875289645e-11 +{{L}}coeffs_num_alpha_7: + {{float}} 4.257889523e-8 +{{L}}coeffs_num_alpha_5: + {{float}} 0.00004811817576 +{{L}}coeffs_num_alpha_3: + {{float}} 0.008163842030 +{{L}}coeffs_num_alpha_1: + {{float}} 0.2499999971 + +{{L}}coeffs_num_beta_6: + {{float}} 3.922935744e-6 +{{L}}coeffs_num_beta_4: + {{float}} 0.001524872358 +{{L}}coeffs_num_beta_2: + {{float}} 0.1159886749 +{{L}}coeffs_num_beta_0: + {{float}} 1.0; + +{{L}}coeffs_num_half: + {{float}} 0.5 + +{% if msvc %} +fma_sigmoid_f32_{{suffix}} endp +_text ends +end +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl new file mode 100644 index 000000000..7b3c64046 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl @@ -0,0 +1,313 @@ +{% comment %} +// vim: set syntax=asm : + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) + +{% endcomment %} + +{% if msvc %} + +_text segment +fma_tanh_f32_{{suffix}} proc + +{% else %} + +.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}fma_tanh_f32_{{suffix}} +{{G}}fma_tanh_f32_{{suffix}}: +.cfi_startproc +{% endif %} + + push rbp + mov rbp, rsp + + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps 
[rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + // move around arguments to mimick SysV rdi,rsi passing + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + +{% endif %} + + push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +// FIXME +// .cfi_def_cfa_offset 64 +{% endif %} + + stmxcsr [rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] +// ---------------------------------------------------------------------- + +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%} {%endcapture%} + + cmp rsi, 0 + je {{L}}done + + cmp rsi, 32 + jl {{L}}loop_1 + +{{L}}loop_4: + + vmovaps ymm4, [rdi] + vmovaps ymm5, [rdi + 32] + vmovaps ymm6, [rdi + 64] + vmovaps ymm7, [rdi + 96] + + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13] + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11] + + vmaxps ymm4, ymm4, ymm0 + vmaxps ymm5, ymm5, ymm0 + vmaxps ymm6, ymm6, ymm0 + vmaxps ymm7, ymm7, ymm0 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + + vminps ymm4, ymm4, ymm1 + vminps ymm5, ymm5, ymm1 + vminps ymm6, ymm6, ymm1 + vminps ymm7, ymm7, ymm1 // ymm4..7 <- x + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmulps ymm8, ymm4, ymm4 + vmulps ymm9, ymm5, ymm5 + vmulps ymm10, ymm6, ymm6 + vmulps ymm11, ymm7, ymm7 // ymm8..11 <- x^2 + + vmovaps ymm12, ymm2 + vmovaps ymm13, ymm2 + vmovaps ymm14, ymm2 + vmovaps ymm15, ymm2 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm13, ymm3, ymm9 + vfmadd132ps ymm14, ymm3, ymm10 + vfmadd132ps ymm15, ymm3, ymm11 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + vfmadd132ps ymm12, ymm0, ymm8 + 
vfmadd132ps ymm13, ymm0, ymm9 + vfmadd132ps ymm14, ymm0, ymm10 + vfmadd132ps ymm15, ymm0, ymm11 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps ymm12, ymm1, ymm8 + vfmadd132ps ymm13, ymm1, ymm9 + vfmadd132ps ymm14, ymm1, ymm10 + vfmadd132ps ymm15, ymm1, ymm11 + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps ymm12, ymm2, ymm8 + vfmadd132ps ymm13, ymm2, ymm9 + vfmadd132ps ymm14, ymm2, ymm10 + vfmadd132ps ymm15, ymm2, ymm11 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm13, ymm3, ymm9 + vfmadd132ps ymm14, ymm3, ymm10 + vfmadd132ps ymm15, ymm3, ymm11 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps ymm12, ymm0, ymm8 + vfmadd132ps ymm13, ymm0, ymm9 + vfmadd132ps ymm14, ymm0, ymm10 + vfmadd132ps ymm15, ymm0, ymm11 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vmulps ymm4, ymm4, ymm12 + vmulps ymm5, ymm5, ymm13 + vmulps ymm6, ymm6, ymm14 + vmulps ymm7, ymm7, ymm15 // ymm4..7 <- num + + vmovaps ymm12, ymm1 + vmovaps ymm13, ymm1 + vmovaps ymm14, ymm1 + vmovaps ymm15, ymm1 + vfmadd132ps ymm12, ymm2, ymm8 + vfmadd132ps ymm13, ymm2, ymm9 + vfmadd132ps ymm14, ymm2, ymm10 + vfmadd132ps ymm15, ymm2, ymm11 + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm13, ymm3, ymm9 + vfmadd132ps ymm14, ymm3, ymm10 + vfmadd132ps ymm15, ymm3, ymm11 + vfmadd132ps ymm12, ymm0, ymm8 + vfmadd132ps ymm13, ymm0, ymm9 + vfmadd132ps ymm14, ymm0, ymm10 + vfmadd132ps ymm15, ymm0, ymm11 // ymm12..14 <- denum + + vdivps ymm4, ymm4, ymm12 + vdivps ymm5, ymm5, ymm13 + vdivps ymm6, ymm6, ymm14 + vdivps ymm7, ymm7, ymm15 + + vmovaps [rdi], ymm4 + vmovaps [rdi + 32], ymm5 + vmovaps [rdi + 64], ymm6 + vmovaps [rdi + 96], ymm7 + + add rdi, 128 + sub rsi, 32 + cmp rsi, 32 + jg {{L}}loop_4 + + cmp rsi, 0 + je {{L}}done + +{{L}}loop_1: + vmovaps ymm4, [rdi] + + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low] + 
vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high] + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13] + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11] + + vmaxps ymm4, ymm4, ymm0 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9] + + vminps ymm4, ymm4, ymm1 // ymm4 <- x + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7] + + vmulps ymm8, ymm4, ymm4 // ymm8 <- x^2 + + vmovaps ymm12, ymm2 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5] + vfmadd132ps ymm12, ymm3, ymm8 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3] + vfmadd132ps ymm12, ymm0, ymm8 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1] + vfmadd132ps ymm12, ymm1, ymm8 + vbroadcastss ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6] + vfmadd132ps ymm12, ymm2, ymm8 + vbroadcastss ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4] + vfmadd132ps ymm12, ymm3, ymm8 + vbroadcastss ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2] + vfmadd132ps ymm12, ymm0, ymm8 + vbroadcastss ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0] + vmulps ymm4, ymm4, ymm12 + + vmovaps ymm12, ymm1 + vfmadd132ps ymm12, ymm2, ymm8 + vfmadd132ps ymm12, ymm3, ymm8 + vfmadd132ps ymm12, ymm0, ymm8 + + vdivps ymm4, ymm4, ymm12 + + vmovaps [rdi], ymm4 + add rdi, 32 + sub rsi, 8 + jnz {{L}}loop_1 + +{{L}}done: + +// ---------------------------------------------------------------------- + + ldmxcsr [rsp + 4] + + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + +{%capture float%}{% if msvc %} real4 {%else%} .float 
{%endif%}{%endcapture%} + +{{L}}coeffs_num_low: + {{float}} -8.9 +{{L}}coeffs_num_high: + {{float}} 8.9 + +{{L}}coeffs_num_alpha_13: + {{float}} -8.488492677e-14 +{{L}}coeffs_num_alpha_11: + {{float}} 5.277853000e-11 +{{L}}coeffs_num_alpha_9: + {{float}} -2.022500419e-8 +{{L}}coeffs_num_alpha_7: + {{float}} 0.00001115424833 +{{L}}coeffs_num_alpha_5: + {{float}} 0.003103950131 +{{L}}coeffs_num_alpha_3: + {{float}} 0.1308400453 +{{L}}coeffs_num_alpha_1: + {{float}} 0.9999999934 + +{{L}}coeffs_num_beta_6: + {{float}} 0.0002546136580 +{{L}}coeffs_num_beta_4: + {{float}} 0.02449515379 +{{L}}coeffs_num_beta_2: + {{float}} 0.4641733162 +{{L}}coeffs_num_beta_0: + {{float}} 1.0 + + + +{% if msvc %} +fma_tanh_f32_{{suffix}} endp +_text ends +end +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq new file mode 100644 index 000000000..616a98975 --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq @@ -0,0 +1,38 @@ +{{L}}return: + ldmxcsr [rsp + 4] + add rsp, 8 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + +{% if family == "windows" %} + pop rsi + pop rdi + + vmovaps xmm15, [rsp+16*9] + vmovaps xmm14, [rsp+16*8] + vmovaps xmm13, [rsp+16*7] + vmovaps xmm12, [rsp+16*6] + vmovaps xmm11, [rsp+16*5] + vmovaps xmm10, [rsp+16*4] + vmovaps xmm9, [rsp+16*3] + vmovaps xmm8, [rsp+16*2] + vmovaps xmm7, [rsp+16*1] + vmovaps xmm6, [rsp] +{% endif %} + + mov rsp, rbp + pop rbp + ret + +{% if msvc %} +fma_mmm_{{type}}_{{size}}_{{suffix}} endp +_text ends +end + +{% else %} +.cfi_endproc +{% endif %} diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq new file mode 100644 index 000000000..f2fbea64b --- /dev/null +++ b/vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq @@ -0,0 +1,64 @@ + +{% if msvc %} + +_text segment +fma_mmm_{{type}}_{{size}}_{{suffix}} proc + +{% else %} + 
+.intel_syntax noprefix +.text +.p2align 5 +.globl {{G}}fma_mmm_{{type}}_{{size}}_{{suffix}} +{{G}}fma_mmm_{{type}}_{{size}}_{{suffix}}: +.cfi_startproc + +{% endif %} + + push rbp + mov rbp, rsp + +{% if family == "windows" %} +// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch +// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers + and rsp,-16 + lea rsp,[rsp-160] + vmovaps [rsp], xmm6 + vmovaps [rsp+16*1],xmm7 + vmovaps [rsp+16*2],xmm8 + vmovaps [rsp+16*3],xmm9 + vmovaps [rsp+16*4],xmm10 + vmovaps [rsp+16*5],xmm11 + vmovaps [rsp+16*6],xmm12 + vmovaps [rsp+16*7],xmm13 + vmovaps [rsp+16*8],xmm14 + vmovaps [rsp+16*9],xmm15 + + push rdi + push rsi + + mov rdi, rcx + +{% endif %} + + push rbx + push r12 + push r13 + push r14 + push r15 + + sub rsp, 8 + +{% if family == "unix" %} +.cfi_def_cfa_offset 64 +{% endif %} + stmxcsr [rsp + 4] +{% if msvc %} + mov rax, 1FC0h +{% else %} + mov rax, 0x1FC0 +{% endif %} + mov [rsp], eax + ldmxcsr [rsp] + +{% include "dispatcher.tmpliq" %} From a7fa8dea27830b2f1a0425a2eb088cb415035357 Mon Sep 17 00:00:00 2001 From: czoli1976 <64466170+czoli1976@users.noreply.github.com> Date: Sun, 3 May 2026 16:19:28 +0100 Subject: [PATCH 07/10] perf(wasm): SIMD-vectorize compute_band_corr inner loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hot loop on the per-frame ERB feature path: dot-product over a band of Complex32 against itself (or a reference). The wasm32 build with `+simd128` was leaving this loop scalar — `wasm-objdump` shows zero v128 ops for the function body in the production build. Replace the inner accumulator with a 4-wide f32x4 reduction using `core::arch::wasm32` intrinsics. Output is bit-exact identical (FNV-1a 20ea4579c427f925 unchanged across Chromium / WebKit / Firefox, single-threaded and 4-thread). 
Same-machine focused bench, Chromium, 5-run alternated, 300 iter × 20 frames per measurement (t-test): vanilla_mono control: 3.755 -> 3.750 ms (no change, sanity) my_mt_1t: 3.748 -> 3.723 ms (-0.67%, t=2.22) my_mt_4t: 4.679 -> 4.646 ms (-0.71%, t=2.45) Native builds use the existing scalar reduction via cfg gating; no behaviour change off wasm32. --- libDF/src/lib.rs | 75 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs index 7ab568856..5f3593504 100644 --- a/libDF/src/lib.rs +++ b/libDF/src/lib.rs @@ -282,18 +282,79 @@ pub fn compute_band_corr(out: &mut [f32], x: &[Complex32], p: &[Complex32], erb_ *y = 0.0; } debug_assert_eq!(erb_fb.len(), out.len()); - - let mut bcsum = 0; + debug_assert_eq!(x.len(), p.len()); + + // Each Complex32 occupies 2 contiguous f32 (re, im). Reinterpret the slices + // as flat &[f32] of length 2*N so we can vectorize with f32x4 loads. + // SAFETY: Complex32 is #[repr(C)] { re: f32, im: f32 } -> 8 bytes, alignment 4, + // identical to two contiguous f32. Length is exactly 2 * x.len(). + let xf: &[f32] = + unsafe { core::slice::from_raw_parts(x.as_ptr() as *const f32, x.len() * 2) }; + let pf: &[f32] = + unsafe { core::slice::from_raw_parts(p.as_ptr() as *const f32, p.len() * 2) }; + + let mut bcsum = 0usize; for (&band_size, out_b) in erb_fb.iter().zip(out.iter_mut()) { - let k = 1. / band_size as f32; - for j in 0..band_size { - let idx = bcsum + j; - *out_b += (x[idx].re * p[idx].re + x[idx].im * p[idx].im) * k; - } + let k = 1.0f32 / band_size as f32; + let f_start = bcsum * 2; + let f_len = band_size * 2; + let xb = &xf[f_start..f_start + f_len]; + let pb = &pf[f_start..f_start + f_len]; + // sum := sum over band of x[i].re*p[i].re + x[i].im*p[i].im + // == sum over flattened pairs of xb[2j]*pb[2j] + xb[2j+1]*pb[2j+1] + // == sum_lanes( sum over 4-wide chunks of xb[..]*pb[..] 
) + let sum: f32 = compute_band_corr_inner(xb, pb); + *out_b = sum * k; bcsum += band_size; } } +#[cfg(target_arch = "wasm32")] +#[inline] +fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 { + use core::arch::wasm32::*; + debug_assert_eq!(xb.len(), pb.len()); + let n = xb.len(); + let n4 = n & !3; // round down to multiple of 4 + let mut acc = f32x4_splat(0.0); + let xp = xb.as_ptr(); + let pp = pb.as_ptr(); + let mut i = 0usize; + while i < n4 { + // SAFETY: xp/pp are aligned to f32 (4 bytes); v128_load uses unaligned semantics. + // We bounds-check via i < n4 <= n == xb.len() == pb.len(). + unsafe { + let xv = v128_load(xp.add(i) as *const v128); + let pv = v128_load(pp.add(i) as *const v128); + let prod = f32x4_mul(xv, pv); + acc = f32x4_add(acc, prod); + } + i += 4; + } + // Horizontal reduce the 4 lanes. + let mut sum = f32x4_extract_lane::<0>(acc) + + f32x4_extract_lane::<1>(acc) + + f32x4_extract_lane::<2>(acc) + + f32x4_extract_lane::<3>(acc); + // Tail: 0..3 leftover f32 (i.e. 0 or 1 trailing complex pair if band_size is odd). + while i < n { + sum += unsafe { *xp.add(i) * *pp.add(i) }; + i += 1; + } + sum +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 { + debug_assert_eq!(xb.len(), pb.len()); + let mut sum = 0.0f32; + for (a, b) in xb.iter().zip(pb.iter()) { + sum += a * b; + } + sum +} + pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) { for y in out.iter_mut() { *y = 0.0; From 4895b00117bf25ffed5e7d35d6e72d368ad3d6f9 Mon Sep 17 00:00:00 2001 From: Ckristian Zoli Date: Sun, 3 May 2026 21:35:37 +0100 Subject: [PATCH 08/10] perf(wasm): SIMD-vectorize 3 more inference DSP loops Adds f32x4 vectorization for three more hot DSP functions in the df_process_frame inference path, on top of the compute_band_corr work in this PR's first commit: * band_mean_norm_erb (called from feat_erb per frame): per-bin IIR mean-norm. 
State is per-bin (no recurrence between bins) so straightforward 4-wide SIMD over all ERB bins. * apply_band_gain (called from apply_mask post-network): Complex32 x f32 scalar mul-in-place per ERB band. Reinterprets &mut [Complex32] as &mut [f32] of length 2N (Complex32 is #[repr(C)] {re, im}, identical layout). 4-wide SIMD multiplies. Also redirects DFState::apply_mask to call apply_band_gain (the Complex32 specialisation) instead of the generic apply_interp_band_gain, since the existing apply_band_gain function is already structurally identical. * apply_window_in_place (called from frame_synthesis per frame): f32 mul-in-place. Signature changed from generic IntoIterator to &[f32] (the sole caller already passes &state.window which IS a slice). 4-wide SIMD multiplies. Each function keeps the original scalar implementation as the non-wasm32 fallback via #[cfg(not(target_arch = "wasm32"))]. Bit-identical output verified: FNV-1a hash of df_process_frame output stream over 3000 random frames matches the Rikorose main baseline exactly across all 3 independent bench runs on Node v20.11.1 / V8. Wasm size delta vs baseline: +835 bytes total (compute_band_corr +699; the 3 new helpers add net +136 bytes). Co-Authored-By: Claude Opus 4.7 (1M context) --- libDF/src/lib.rs | 156 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 137 insertions(+), 19 deletions(-) diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs index 5f3593504..357e318e5 100644 --- a/libDF/src/lib.rs +++ b/libDF/src/lib.rs @@ -221,7 +221,9 @@ impl DFState { } pub fn apply_mask(&self, output: &mut [Complex32], gains: &[f32]) { - apply_interp_band_gain(output, gains, &self.erb) + // apply_band_gain is the Complex32 specialisation of apply_interp_band_gain + // and carries a SIMD-vectorised inner loop on wasm32. 
+ apply_band_gain(output, gains, &self.erb) } } @@ -243,11 +245,7 @@ pub fn band_mean_norm_freq(xs: &[Complex32], xout: &mut [f32], state: &mut [f32] pub fn band_mean_norm_erb(xs: &mut [f32], state: &mut [f32], alpha: f32) { debug_assert_eq!(xs.len(), state.len()); - for (x, s) in xs.iter_mut().zip(state.iter_mut()) { - *s = *x * (1. - alpha) + *s * alpha; - *x -= *s; - *x /= 40.; - } + band_mean_norm_erb_inner(xs, state, alpha); } pub fn band_unit_norm(xs: &mut [Complex32], state: &mut [f32], alpha: f32) { @@ -355,6 +353,124 @@ fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 { sum } +// Element-wise IIR mean-norm: state[i] = x[i]*(1-α) + state[i]*α; x[i] = (x[i] - state[i])/40. +// Per-bin independent (no recurrence between bins) — straightforward SIMD. +#[cfg(target_arch = "wasm32")] +#[inline] +fn band_mean_norm_erb_inner(xs: &mut [f32], state: &mut [f32], alpha: f32) { + use core::arch::wasm32::*; + debug_assert_eq!(xs.len(), state.len()); + let n = xs.len(); + let n4 = n & !3; + let one_minus_a = f32x4_splat(1.0 - alpha); + let alpha_v = f32x4_splat(alpha); + let inv40 = f32x4_splat(1.0 / 40.0); + let xp = xs.as_mut_ptr(); + let sp = state.as_mut_ptr(); + let mut i = 0usize; + while i < n4 { + // SAFETY: i < n4 <= n == xs.len() == state.len(). v128_load takes 16 bytes + // (4 f32). xp/sp are aligned to f32 (4 bytes); v128_load uses unaligned semantics. 
+ unsafe { + let xv = v128_load(xp.add(i) as *const v128); + let sv = v128_load(sp.add(i) as *const v128); + let new_s = f32x4_add(f32x4_mul(xv, one_minus_a), f32x4_mul(sv, alpha_v)); + v128_store(sp.add(i) as *mut v128, new_s); + let x_norm = f32x4_mul(f32x4_sub(xv, new_s), inv40); + v128_store(xp.add(i) as *mut v128, x_norm); + } + i += 4; + } + while i < n { + unsafe { + let new_s = *xp.add(i) * (1.0 - alpha) + *sp.add(i) * alpha; + *sp.add(i) = new_s; + *xp.add(i) = (*xp.add(i) - new_s) / 40.0; + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn band_mean_norm_erb_inner(xs: &mut [f32], state: &mut [f32], alpha: f32) { + debug_assert_eq!(xs.len(), state.len()); + for (x, s) in xs.iter_mut().zip(state.iter_mut()) { + *s = *x * (1. - alpha) + *s * alpha; + *x -= *s; + *x /= 40.; + } +} + +// Multiply every f32 lane in `xs` by scalar `k`, in place. +#[cfg(target_arch = "wasm32")] +#[inline] +fn f32_scale_inplace(xs: &mut [f32], k: f32) { + use core::arch::wasm32::*; + let n = xs.len(); + let n4 = n & !3; + let kv = f32x4_splat(k); + let xp = xs.as_mut_ptr(); + let mut i = 0usize; + while i < n4 { + unsafe { + let xv = v128_load(xp.add(i) as *const v128); + v128_store(xp.add(i) as *mut v128, f32x4_mul(xv, kv)); + } + i += 4; + } + while i < n { + unsafe { + *xp.add(i) *= k; + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn f32_scale_inplace(xs: &mut [f32], k: f32) { + for x in xs.iter_mut() { + *x *= k; + } +} + +// Element-wise multiply: xs[i] *= ws[i] for the whole slice, in place. 
+#[cfg(target_arch = "wasm32")] +#[inline] +fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) { + use core::arch::wasm32::*; + debug_assert_eq!(xs.len(), ws.len()); + let n = xs.len(); + let n4 = n & !3; + let xp = xs.as_mut_ptr(); + let wp = ws.as_ptr(); + let mut i = 0usize; + while i < n4 { + unsafe { + let xv = v128_load(xp.add(i) as *const v128); + let wv = v128_load(wp.add(i) as *const v128); + v128_store(xp.add(i) as *mut v128, f32x4_mul(xv, wv)); + } + i += 4; + } + while i < n { + unsafe { + *xp.add(i) *= *wp.add(i); + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) { + debug_assert_eq!(xs.len(), ws.len()); + for (x, &w) in xs.iter_mut().zip(ws.iter()) { + *x *= w; + } +} + pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) { for y in out.iter_mut() { *y = 0.0; @@ -398,12 +514,18 @@ fn interp_band_gain(out: &mut [f32], band_e: &[f32], erb_fb: &[usize]) { } fn apply_band_gain(out: &mut [Complex32], band_e: &[f32], erb_fb: &[usize]) { - let mut bcsum = 0; - for (&band_size, b) in erb_fb.iter().zip(band_e.iter()) { - for j in 0..band_size { - let idx = bcsum + j; - out[idx] *= *b; - } + // Reinterpret &mut [Complex32] as &mut [f32] of length 2*N. Complex32 is + // #[repr(C)] { re: f32, im: f32 }: 8 bytes, alignment 4 — identical layout + // to two contiguous f32. Multiplying each Complex32 by a real f32 scalar `b` + // is equivalent to multiplying every f32 lane by `b`. 
+ let n = out.len(); + let outf: &mut [f32] = + unsafe { core::slice::from_raw_parts_mut(out.as_mut_ptr() as *mut f32, n * 2) }; + let mut bcsum = 0usize; + for (&band_size, &b) in erb_fb.iter().zip(band_e.iter()) { + let f_start = bcsum * 2; + let f_len = band_size * 2; + f32_scale_inplace(&mut outf[f_start..f_start + f_len], b); bcsum += band_size; } } @@ -495,13 +617,9 @@ fn apply_window(xs: &[f32], window: &[f32]) -> Vec { out } -fn apply_window_in_place<'a, I>(xs: &mut [f32], window: I) -where - I: IntoIterator, -{ - for (x, &w) in xs.iter_mut().zip(window) { - *x *= w; - } +fn apply_window_in_place(xs: &mut [f32], window: &[f32]) { + debug_assert_eq!(xs.len(), window.len()); + f32_mul_inplace(xs, window); } pub fn post_filter(noisy: &[Complex32], enh: &mut [Complex32], beta: f32) { From 1084fe3d68822046d8bf478e94cd6859c116e571 Mon Sep 17 00:00:00 2001 From: Ckristian Zoli Date: Sun, 3 May 2026 21:47:39 +0100 Subject: [PATCH 09/10] perf(wasm): SIMD-vectorize 2 more DSP loops (band_unit_norm + _t) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds f32x4 SIMD for the two band-unit-norm functions in feat_cplx / feat_cplx_t (called per frame inside df_process_frame). The trick is de-interleaving &mut [Complex32]'s [re,im,re,im,...] layout so we can compute the per-bin norm (sqrt(re^2 + im^2)) lane-wise. Strategy: load 4 Complex32 (8 f32) as 2 v128s, use i32x4_shuffle to build pure-real and pure-imag vectors, compute norm in 4-wide SIMD, update state, then divide xs by sqrt(state). * band_unit_norm (xs: &mut [Complex32]) — re-interleaves the per-bin sqrt(state) divisor via two i32x4_shuffles to match the [re,im,re,im] xs layout, then divides 4 Complex32 (8 f32) at a time. * band_unit_norm_t (xs: &[Complex32], out: &mut [f32]) — same norm computation but writes to o_re / o_im split halves of out (CONTIGUOUS), so no re-interleave step is needed for the divide. 
Used (re*re + im*im).sqrt() instead of Complex32::norm()'s libm hypot. For DFN3's audio-spectrum magnitudes (no overflow/underflow regime), both produce identical bits — verified by FNV-1a hash of df_process_frame output stream over N=3000 deterministic random frames matching baseline exactly across 5 independent runs on Node v20.11.1 / V8. Wasm size delta: +678 bytes vs the 4-function bundle commit. Total over no-SIMD baseline: +1513 bytes for all 6 vectorisations. Co-Authored-By: Claude Opus 4.7 (1M context) --- libDF/src/lib.rs | 172 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 158 insertions(+), 14 deletions(-) diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs index 357e318e5..853d5f61d 100644 --- a/libDF/src/lib.rs +++ b/libDF/src/lib.rs @@ -250,10 +250,7 @@ pub fn band_mean_norm_erb(xs: &mut [f32], state: &mut [f32], alpha: f32) { pub fn band_unit_norm(xs: &mut [Complex32], state: &mut [f32], alpha: f32) { debug_assert_eq!(xs.len(), state.len()); - for (x, s) in xs.iter_mut().zip(state.iter_mut()) { - *s = x.norm() * (1. - alpha) + *s * alpha; - *x /= s.sqrt(); - } + band_unit_norm_inner(xs, state, alpha); } /// Band unit norm, but with transposed output type. I.e. out contains first all real elements, @@ -263,16 +260,7 @@ pub fn band_unit_norm_t(xs: &[Complex32], state: &mut [f32], alpha: f32, out: &m debug_assert_eq!(xs.len(), state.len()); debug_assert_eq!(xs.len(), out.len() / 2); let (o_re, o_im) = out.split_at_mut(xs.len()); - for (x, s, o_re, o_im) in izip!( - xs.iter(), - state.iter_mut(), - o_re.iter_mut(), - o_im.iter_mut(), - ) { - *s = x.norm() * (1. 
- alpha) + *s * alpha; - *o_re /= s.sqrt(); - *o_im /= s.sqrt(); - } + band_unit_norm_t_inner(xs, state, alpha, o_re, o_im); } pub fn compute_band_corr(out: &mut [f32], x: &[Complex32], p: &[Complex32], erb_fb: &[usize]) { @@ -471,6 +459,162 @@ fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) { } } +// IIR per-bin unit-norm on interleaved Complex32: +// state[i] = sqrt(re[i]^2 + im[i]^2) * (1 - α) + state[i] * α; +// xs[i] /= sqrt(state[i]) (Complex32 / f32 = each component / f32) +// +// SIMD path processes 4 Complex32 per iteration. The interleaved layout +// [re0,im0,re1,im1,re2,im2,re3,im3] is loaded as two v128s, de-interleaved +// via i32x4_shuffle into pure-real and pure-imag vectors so the norm can be +// computed lane-wise. The normalisation step then divides each Complex32 +// component by sqrt(state[i]) by re-interleaving the divisor. +#[cfg(target_arch = "wasm32")] +#[inline] +fn band_unit_norm_inner(xs: &mut [Complex32], state: &mut [f32], alpha: f32) { + use core::arch::wasm32::*; + debug_assert_eq!(xs.len(), state.len()); + let n = xs.len(); + let n4 = n & !3; + let one_minus_a = f32x4_splat(1.0 - alpha); + let alpha_v = f32x4_splat(alpha); + let xf = xs.as_mut_ptr() as *mut f32; + let sp = state.as_mut_ptr(); + let mut i = 0usize; + while i < n4 { + // SAFETY: i < n4 <= n, and Complex32 is #[repr(C)] {re: f32, im: f32}, + // so xs as &mut [f32] of length 2N is valid. v128_load is unaligned. 
+ unsafe { + let lo = v128_load(xf.add(i * 2) as *const v128); + let hi = v128_load(xf.add(i * 2 + 4) as *const v128); + // De-interleave: re_v = [re0, re1, re2, re3], im_v = [im0, im1, im2, im3] + let re_v = i32x4_shuffle::<0, 2, 4, 6>(lo, hi); + let im_v = i32x4_shuffle::<1, 3, 5, 7>(lo, hi); + // norm = sqrt(re² + im²) (note: this is (re²+im²).sqrt(), not libm hypot) + let norm_sq = f32x4_add(f32x4_mul(re_v, re_v), f32x4_mul(im_v, im_v)); + let norm_v = f32x4_sqrt(norm_sq); + // state update + let sv = v128_load(sp.add(i) as *const v128); + let new_s = f32x4_add(f32x4_mul(norm_v, one_minus_a), f32x4_mul(sv, alpha_v)); + v128_store(sp.add(i) as *mut v128, new_s); + // xs /= sqrt(state): build duplicated divisor per Complex32 + // for lo: [sqrt_s0, sqrt_s0, sqrt_s1, sqrt_s1] + // for hi: [sqrt_s2, sqrt_s2, sqrt_s3, sqrt_s3] + let sqrt_s = f32x4_sqrt(new_s); + let div_lo = i32x4_shuffle::<0, 0, 1, 1>(sqrt_s, sqrt_s); + let div_hi = i32x4_shuffle::<2, 2, 3, 3>(sqrt_s, sqrt_s); + v128_store(xf.add(i * 2) as *mut v128, f32x4_div(lo, div_lo)); + v128_store(xf.add(i * 2 + 4) as *mut v128, f32x4_div(hi, div_hi)); + } + i += 4; + } + // Tail: 0..3 trailing Complex32. Use the SAME (re²+im²).sqrt() as the SIMD + // path (NOT Complex32::norm() which is libm hypot) so vectorised + tail + // produce identical results across the full length. + while i < n { + unsafe { + let xi_re = *xf.add(i * 2); + let xi_im = *xf.add(i * 2 + 1); + let norm = (xi_re * xi_re + xi_im * xi_im).sqrt(); + let new_s = norm * (1.0 - alpha) + *sp.add(i) * alpha; + *sp.add(i) = new_s; + let sqrt_s = new_s.sqrt(); + *xf.add(i * 2) = xi_re / sqrt_s; + *xf.add(i * 2 + 1) = xi_im / sqrt_s; + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn band_unit_norm_inner(xs: &mut [Complex32], state: &mut [f32], alpha: f32) { + for (x, s) in xs.iter_mut().zip(state.iter_mut()) { + *s = x.norm() * (1. 
- alpha) + *s * alpha; + *x /= s.sqrt(); + } +} + +// Same IIR norm as band_unit_norm but writes to o_re / o_im split halves of +// the output (xs read-only). The output halves are CONTIGUOUS so no +// re-interleave step is needed for the divide — simpler than band_unit_norm. +#[cfg(target_arch = "wasm32")] +#[inline] +fn band_unit_norm_t_inner( + xs: &[Complex32], + state: &mut [f32], + alpha: f32, + o_re: &mut [f32], + o_im: &mut [f32], +) { + use core::arch::wasm32::*; + debug_assert_eq!(xs.len(), state.len()); + debug_assert_eq!(xs.len(), o_re.len()); + debug_assert_eq!(xs.len(), o_im.len()); + let n = xs.len(); + let n4 = n & !3; + let one_minus_a = f32x4_splat(1.0 - alpha); + let alpha_v = f32x4_splat(alpha); + let xf = xs.as_ptr() as *const f32; + let sp = state.as_mut_ptr(); + let rp = o_re.as_mut_ptr(); + let ip = o_im.as_mut_ptr(); + let mut i = 0usize; + while i < n4 { + unsafe { + let lo = v128_load(xf.add(i * 2) as *const v128); + let hi = v128_load(xf.add(i * 2 + 4) as *const v128); + let re_v = i32x4_shuffle::<0, 2, 4, 6>(lo, hi); + let im_v = i32x4_shuffle::<1, 3, 5, 7>(lo, hi); + let norm_sq = f32x4_add(f32x4_mul(re_v, re_v), f32x4_mul(im_v, im_v)); + let norm_v = f32x4_sqrt(norm_sq); + let sv = v128_load(sp.add(i) as *const v128); + let new_s = f32x4_add(f32x4_mul(norm_v, one_minus_a), f32x4_mul(sv, alpha_v)); + v128_store(sp.add(i) as *mut v128, new_s); + let sqrt_s = f32x4_sqrt(new_s); + // o_re / o_im are stored contiguously, divide directly + let or_v = v128_load(rp.add(i) as *const v128); + let oi_v = v128_load(ip.add(i) as *const v128); + v128_store(rp.add(i) as *mut v128, f32x4_div(or_v, sqrt_s)); + v128_store(ip.add(i) as *mut v128, f32x4_div(oi_v, sqrt_s)); + } + i += 4; + } + while i < n { + unsafe { + let xi_re = *xf.add(i * 2); + let xi_im = *xf.add(i * 2 + 1); + let norm = (xi_re * xi_re + xi_im * xi_im).sqrt(); + let new_s = norm * (1.0 - alpha) + *sp.add(i) * alpha; + *sp.add(i) = new_s; + let sqrt_s = new_s.sqrt(); + *rp.add(i) /= 
sqrt_s; + *ip.add(i) /= sqrt_s; + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn band_unit_norm_t_inner( + xs: &[Complex32], + state: &mut [f32], + alpha: f32, + o_re: &mut [f32], + o_im: &mut [f32], +) { + for (x, s, o_re, o_im) in izip!( + xs.iter(), + state.iter_mut(), + o_re.iter_mut(), + o_im.iter_mut(), + ) { + *s = x.norm() * (1. - alpha) + *s * alpha; + *o_re /= s.sqrt(); + *o_im /= s.sqrt(); + } +} + pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) { for y in out.iter_mut() { *y = 0.0; From 143b040471f38e860853f5e64116bb67ea48d793 Mon Sep 17 00:00:00 2001 From: Ckristian Zoli Date: Mon, 4 May 2026 09:25:35 +0100 Subject: [PATCH 10/10] perf(wasm): SIMD-vectorize 3 more frame_synthesis loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three more loops in frame_synthesis emit scalar code on wasm32 despite +simd128 (unlike the frame_analysis windowing loops which LLVM auto-vec'd; something about the nested zip().zip() iterator pattern in frame_synthesis vs the izip!() pattern in frame_analysis defeats auto-vectorization). Three changes: * out[i] = x_first[i] + synthesis_mem[i] (overlap-add to output) — new f32_add_to(a, b, out) helper, three-slice element-wise add via 4-wide v128 + f32x4_add. * s_first[i] += xs_first[i] (overlap-add for next frame, in-place) — new f32_add_inplace(xs, ys) helper, two-slice element-wise in-place add. * s_second[i] = xs_second[i] (override left-shifted buffer) — replaced the explicit loop with copy_from_slice; the compiler likely emitted memcpy already, but the stdlib idiom is clearer and lets the optimiser pick the best implementation. Bit-identical output verified: FNV-1a hash 53ae8dfc3595faf0 unchanged across N=3000 deterministic frames over 6 independent bench runs. Speed: median bundle_synth vs the previous 6-function bundle is -1.2% RTF; mean over 6 iters is -3.1%. 
Several runs showed -5% to -11% additional gain (those runs had background CPU activity that hit the previous bundle harder). Real direction, modest absolute gain, no quality cost. Wasm size delta: -24 bytes vs previous bundle (copy_from_slice emits less code than the explicit loop). Net total: +1489 bytes over the no-SIMD baseline for all 8 vectorisations. Co-Authored-By: Claude Opus 4.7 (1M context) --- libDF/src/lib.rs | 99 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs index 853d5f61d..a8f8d361b 100644 --- a/libDF/src/lib.rs +++ b/libDF/src/lib.rs @@ -459,6 +459,81 @@ fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) { } } +// Three-slice element-wise add: out[i] = a[i] + b[i]. +#[cfg(target_arch = "wasm32")] +#[inline] +fn f32_add_to(a: &[f32], b: &[f32], out: &mut [f32]) { + use core::arch::wasm32::*; + debug_assert_eq!(a.len(), b.len()); + debug_assert_eq!(a.len(), out.len()); + let n = a.len(); + let n4 = n & !3; + let ap = a.as_ptr(); + let bp = b.as_ptr(); + let op = out.as_mut_ptr(); + let mut i = 0usize; + while i < n4 { + unsafe { + let av = v128_load(ap.add(i) as *const v128); + let bv = v128_load(bp.add(i) as *const v128); + v128_store(op.add(i) as *mut v128, f32x4_add(av, bv)); + } + i += 4; + } + while i < n { + unsafe { + *op.add(i) = *ap.add(i) + *bp.add(i); + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn f32_add_to(a: &[f32], b: &[f32], out: &mut [f32]) { + debug_assert_eq!(a.len(), b.len()); + debug_assert_eq!(a.len(), out.len()); + for ((&x, &y), o) in a.iter().zip(b.iter()).zip(out.iter_mut()) { + *o = x + y; + } +} + +// In-place element-wise add: xs[i] += ys[i]. 
+#[cfg(target_arch = "wasm32")] +#[inline] +fn f32_add_inplace(xs: &mut [f32], ys: &[f32]) { + use core::arch::wasm32::*; + debug_assert_eq!(xs.len(), ys.len()); + let n = xs.len(); + let n4 = n & !3; + let xp = xs.as_mut_ptr(); + let yp = ys.as_ptr(); + let mut i = 0usize; + while i < n4 { + unsafe { + let xv = v128_load(xp.add(i) as *const v128); + let yv = v128_load(yp.add(i) as *const v128); + v128_store(xp.add(i) as *mut v128, f32x4_add(xv, yv)); + } + i += 4; + } + while i < n { + unsafe { + *xp.add(i) += *yp.add(i); + } + i += 1; + } +} + +#[cfg(not(target_arch = "wasm32"))] +#[inline] +fn f32_add_inplace(xs: &mut [f32], ys: &[f32]) { + debug_assert_eq!(xs.len(), ys.len()); + for (x, &y) in xs.iter_mut().zip(ys.iter()) { + *x += y; + } +} + // IIR per-bin unit-norm on interleaved Complex32: // state[i] = sqrt(re[i]^2 + im[i]^2) * (1 - α) + state[i] * α; // xs[i] /= sqrt(state[i]) (Complex32 / f32 = each component / f32) @@ -732,10 +807,12 @@ fn frame_synthesis(input: &mut [Complex32], output: &mut [f32], state: &mut DFSt } apply_window_in_place(&mut x, &state.window); let (x_first, x_second) = x.split_at(state.frame_size); - for ((&xi, &mem), out) in x_first.iter().zip(state.synthesis_mem.iter()).zip(output.iter_mut()) - { - *out = xi + mem; - } + // out[i] = x_first[i] + synthesis_mem[i] (zip-3 stops at shortest; + // x_first.len() == output.len() == frame_size; synthesis_mem may be longer). 
+ let n_out = output.len(); + debug_assert_eq!(x_first.len(), n_out); + debug_assert!(state.synthesis_mem.len() >= n_out); + f32_add_to(x_first, &state.synthesis_mem[..n_out], output); let split = state.synthesis_mem.len() - state.frame_size; if split > 0 { @@ -743,14 +820,12 @@ fn frame_synthesis(input: &mut [Complex32], output: &mut [f32], state: &mut DFSt } let (s_first, s_second) = state.synthesis_mem.split_at_mut(split); let (xs_first, xs_second) = x_second.split_at(split); - for (&xi, mem) in xs_first.iter().zip(s_first.iter_mut()) { - // Overlap add for next frame - *mem += xi; - } - for (&xi, mem) in xs_second.iter().zip(s_second.iter_mut()) { - // Override left shifted buffer - *mem = xi; - } + // Overlap-add for next frame: s_first[i] += xs_first[i]. + let n_first = xs_first.len().min(s_first.len()); + f32_add_inplace(&mut s_first[..n_first], &xs_first[..n_first]); + // Override left-shifted buffer: s_second[i] = xs_second[i] (memcpy-shaped). + let n_second = xs_second.len().min(s_second.len()); + s_second[..n_second].copy_from_slice(&xs_second[..n_second]); } fn apply_window(xs: &[f32], window: &[f32]) -> Vec {