diff --git a/linalg/src/cache.rs b/linalg/src/cache.rs
new file mode 100644
index 0000000000..01a1d33925
--- /dev/null
+++ b/linalg/src/cache.rs
@@ -0,0 +1,378 @@
+//! Best-effort runtime CPU data-cache geometry detection.
+//!
+//! Cache blocking (panel-block sizing in `mmm`, im2col lowering thresholds, …)
+//! is only correct when the block budget is derived from the *actual* cache the
+//! code runs on, not a hard-coded constant. This module centralises that
+//! detection so every heuristic reads the same memoised numbers instead of each
+//! re-implementing a platform probe.
+//!
+//! All sizes are **bytes**, with `0` meaning "could not detect on this platform"
+//! — callers must treat `0` as unknown and fall back conservatively (never
+//! over-block a cache you cannot see). The raw fields stay honest; the
+//! `*_or_default` helpers apply an architecture-based guess for callers that
+//! prefer a number to a zero.
+//!
+//! Detection is done once, lazily, and cached for the process lifetime.
+
+use std::sync::OnceLock;
+
+/// Detected data-cache sizes in bytes; `0` == unknown, so `default()` means "nothing detected".
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+pub struct CacheInfo {
+    /// L1 data cache (per core), bytes. `0` if unknown.
+    pub l1_data: usize,
+    /// L2 cache (per perf-core / cluster), bytes. `0` if unknown.
+    pub l2: usize,
+    /// L3 / last-level cache, bytes. `0` if unknown.
+    pub l3: usize,
+}
+
+impl CacheInfo {
+    /// L1 data cache size in bytes. When the probe reported nothing (`0`) this
+    /// falls back to an architecture-based guess: 64 KiB on arm64, 32 KiB on
+    /// every other target.
+    pub fn l1_data_or_default(&self) -> usize {
+        match self.l1_data {
+            0 if cfg!(target_arch = "aarch64") => 64 * 1024,
+            0 => 32 * 1024,
+            detected => detected,
+        }
+    }
+
+    /// L2 cache size in bytes, substituting a conservative 256 KiB when the
+    /// probe reported nothing (`0`).
+    pub fn l2_or_default(&self) -> usize {
+        if self.l2 == 0 { 256 * 1024 } else { self.l2 }
+    }
+}
+
+/// Cache geometry of the current machine: probed on the first call, memoised afterwards.
+pub fn cache_info() -> CacheInfo {
+    static DETECTED: OnceLock<CacheInfo> = OnceLock::new();
+    *DETECTED.get_or_init(detect)
+}
+
+/// Provenance of the last-level cache used for the outer GEMM blocking tier; it
+/// tells the caller how much of that cache a single thread may budget.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LlcKind {
+    /// Architectural cluster L3 (or an operator-provided size) — effectively
+    /// private to the CPU, so a single thread can assume most of it.
+    Dedicated,
+    /// System-Level Cache: an interconnect cache shared with the GPU/NPU/display (e.g.
+    /// Qualcomm LLCC, Apple SLC), or an override flagged contended. Budget conservatively.
+    SystemLevel,
+}
+
+/// Size (bytes) and kind of the last-level cache to size the outer GEMM blocking
+/// tier against, or `None` when nothing usefully larger than L2 is known.
+/// Resolved on the first call and memoised for the process lifetime, so the env
+/// lookups and devicetree walk run once; set `TRACT_LLC_*` before that first call.
+///
+/// Resolution order (first hit wins):
+///  1. `TRACT_LLC_BYTES` env override (e.g. `"8M"`, `"33554432"`) — for embedders
+///     who know their SoC's LLC/SLC when the OS doesn't expose it. Marked
+///     [`LlcKind::SystemLevel`] iff `TRACT_LLC_CONTENDED` is set, else `Dedicated`.
+///  2. architecturally-detected L3 ([`CacheInfo::l3`]) when it exceeds L2 — `Dedicated`.
+///  3. a System-Level Cache discovered via the Linux devicetree (`cache-level == 3`
+///     with a `cache-size`, outside `/cpus`) — `SystemLevel`.
+///
+/// The per-CPU `cpu/cache/index*` topology the L3 probe reads does **not** list an
+/// SLC (it's a separate interconnect IP), which is why an SLC needs a distinct
+/// source. Prior art — runtime cache sizing: Eigen `queryCacheSizes` (CPUID/sysctl),
+/// glibc `sysconf(_SC_LEVELx_CACHE_SIZE)`, ACPI PPTT, hwloc. SLC exposure: Qualcomm
+/// LLCC (`drivers/soc/qcom/llcc-qcom.c`, devicetree `qcom,llcc`) and the generic
+/// devicetree cache bindings.
+pub fn last_level_cache() -> Option<(usize, LlcKind)> {
+    static LLC: OnceLock<Option<(usize, LlcKind)>> = OnceLock::new();
+    *LLC.get_or_init(|| {
+        let ci = cache_info();
+        let contended = std::env::var_os("TRACT_LLC_CONTENDED").is_some();
+        resolve_llc(env_llc_override(), contended, ci.l2, ci.l3, system_level_cache_bytes())
+    })
+}
+
+/// Side-effect-free core of [`last_level_cache`]: every input is passed in, so the
+/// priority order can be unit-tested without process env or hardware access.
+fn resolve_llc(
+    override_bytes: Option<usize>,
+    override_contended: bool,
+    l2: usize,
+    l3: usize,
+    slc: usize,
+) -> Option<(usize, LlcKind)> {
+    match override_bytes {
+        // 1. A non-zero operator override beats anything detected.
+        Some(bytes) if bytes > 0 && override_contended => Some((bytes, LlcKind::SystemLevel)),
+        Some(bytes) if bytes > 0 => Some((bytes, LlcKind::Dedicated)),
+        // 2. Architectural L3, but only when strictly larger than L2.
+        _ if l3 > l2 => Some((l3, LlcKind::Dedicated)),
+        // 3. Devicetree SLC under the same rule (`slc > l2` already implies `slc > 0`).
+        _ if slc > l2 => Some((slc, LlcKind::SystemLevel)),
+        // Nothing usefully larger than L2 is known.
+        _ => None,
+    }
+}
+
+fn env_llc_override() -> Option<usize> {
+    let raw = std::env::var("TRACT_LLC_BYTES").ok()?;
+    Some(parse_cache_size(&raw)).filter(|bytes| *bytes > 0)
+}
+
+/// Best-effort System-Level Cache size (bytes) from the Linux devicetree: the largest node
+/// carrying `cache-level == 3` *and* a `cache-size`, outside the `/cpus` subtree (so it is an
+/// interconnect cache, not a CPU cache the L3 probe already saw). Returns `0` when unavailable —
+/// e.g. SLCs whose size is fixed in the controller (Qualcomm LLCC) carry no `cache-size` here,
+/// so those still rely on the `TRACT_LLC_BYTES` override.
+#[cfg(any(target_os = "linux", target_os = "android"))]
+fn system_level_cache_bytes() -> usize {
+    use std::path::Path;
+    // First four bytes of a property file as a big-endian `u32`; `None` if unreadable or shorter.
+    fn be_u32(p: &Path) -> Option<u32> {
+        let b = std::fs::read(p).ok()?;
+        (b.len() >= 4).then(|| u32::from_be_bytes([b[0], b[1], b[2], b[3]]))
+    }
+    fn walk(dir: &Path, depth: usize, best: &mut usize) {
+        if depth == 0 {
+            return;
+        }
+        if be_u32(&dir.join("cache-level")) == Some(3) {
+            let sz = be_u32(&dir.join("cache-size")).unwrap_or(0) as usize;
+            *best = (*best).max(sz);
+        }
+        let Ok(rd) = std::fs::read_dir(dir) else { return };
+        for e in rd.flatten() {
+            let p = e.path();
+            // CPU caches are handled by the architectural L3 probe; skip them.
+            if p.is_dir() && p.file_name().and_then(|n| n.to_str()) != Some("cpus") {
+                walk(&p, depth - 1, best);
+            }
+        }
+    }
+    let mut best = 0;
+    for root in ["/proc/device-tree", "/sys/firmware/devicetree/base"] {
+        let p = Path::new(root);
+        if p.exists() {
+            walk(p, 4, &mut best); // the root plus three levels beneath it
+            if best > 0 {
+                break; // the first root that yields a size wins
+            }
+        }
+    }
+    best
+}
+
+#[cfg(not(any(target_os = "linux", target_os = "android")))]
+fn system_level_cache_bytes() -> usize {
+    0 // no devicetree probe on this target: report "unknown"
+}
+
+/// Parse a cache size (`"256K"`, `"8M"`, `"512"` bytes); unparseable or overflowing input ⇒ `0`.
+#[cfg_attr(not(any(target_os = "linux", target_os = "android")), allow(dead_code))]
+fn parse_cache_size(s: &str) -> usize {
+    let s = s.trim();
+    let (num, mult) = if let Some(n) = s.strip_suffix(['K', 'k']) {
+        (n, 1024)
+    } else if let Some(n) = s.strip_suffix(['M', 'm']) {
+        (n, 1024 * 1024)
+    } else {
+        (s, 1)
+    };
+    num.trim().parse::<usize>().ok().and_then(|n| n.checked_mul(mult)).unwrap_or(0)
+}
+
+#[cfg(any(target_os = "macos", target_os = "ios"))]
+fn detect() -> CacheInfo {
+    // Read a scalar `hw.*` sysctl by name via the libc FFI (no subprocess). macOS returns
+    // these as a little-endian integer (4 or 8 bytes); a zeroed 8-byte buffer reads either
+    // width correctly on little-endian Apple silicon and Intel.
+    fn sysctl_usize(name: &str) -> Option<usize> {
+        use std::ffi::CString;
+        use std::os::raw::{c_char, c_int, c_void};
+        unsafe extern "C" {
+            fn sysctlbyname(
+                name: *const c_char,
+                oldp: *mut c_void,
+                oldlenp: *mut usize,
+                newp: *mut c_void,
+                newlen: usize,
+            ) -> c_int;
+        }
+        let cname = CString::new(name).ok()?;
+        let mut val: u64 = 0;
+        let mut len = std::mem::size_of::<u64>();
+        // SAFETY: `cname` is NUL-terminated and `len` is the size of the live `val` buffer.
+        let rc = unsafe {
+            sysctlbyname(
+                cname.as_ptr(),
+                &mut val as *mut u64 as *mut c_void,
+                &mut len,
+                std::ptr::null_mut(),
+                0,
+            )
+        };
+        if rc != 0 || val == 0 { None } else { Some(val as usize) }
+    }
+
+    CacheInfo {
+        // perflevel0 is the performance cluster on hybrid Apple Silicon.
+        l1_data: sysctl_usize("hw.perflevel0.l1dcachesize")
+            .or_else(|| sysctl_usize("hw.l1dcachesize"))
+            .unwrap_or(0),
+        l2: sysctl_usize("hw.perflevel0.l2cachesize")
+            .or_else(|| sysctl_usize("hw.l2cachesize"))
+            .unwrap_or(0),
+        l3: sysctl_usize("hw.perflevel0.l3cachesize")
+            .or_else(|| sysctl_usize("hw.l3cachesize"))
+            .unwrap_or(0),
+    }
+}
+
+#[cfg(any(target_os = "linux", target_os = "android"))]
+fn detect() -> CacheInfo {
+    // Classify every cpu0 `cache/indexN` entry by the level and type it reports
+    // instead of assuming a fixed index layout (it varies: SMT, unified vs split
+    // L2, …).
+    let mut found = CacheInfo::default();
+    for idx in 0..16 {
+        let dir = format!("/sys/devices/system/cpu/cpu0/cache/index{idx}/");
+        let attr = |name: &str| std::fs::read_to_string(format!("{dir}{name}"));
+        // No readable `level` file: nothing to classify at this index, so move
+        // on to the next one.
+        let Ok(level) = attr("level") else { continue };
+        let level = level.trim().parse::<usize>().unwrap_or(0);
+        let kind = attr("type").unwrap_or_default().trim().to_ascii_lowercase();
+        let bytes = attr("size").map(|s| parse_cache_size(&s)).unwrap_or(0);
+        // An unreadable or unparseable size is as good as no entry.
+        if bytes == 0 {
+            continue;
+        }
+        // Pick the field this entry describes; instruction caches and levels
+        // other than 1..=3 are not tracked.
+        let slot = match level {
+            1 if kind == "data" || kind == "unified" => &mut found.l1_data,
+            2 => &mut found.l2,
+            3 => &mut found.l3,
+            _ => continue,
+        };
+        // The first entry seen for a level wins; later ones are ignored.
+        if *slot == 0 {
+            *slot = bytes;
+        }
+    }
+    found
+}
+
+#[cfg(target_os = "windows")]
+fn detect() -> CacheInfo {
+    // wmic is the dependency-free option even though it is deprecated on Win11.
+    // It only reports L2/L3 (in KiB), so L1 stays unknown (→ l1_data_or_default);
+    // a future GetLogicalProcessorInformationEx probe would also yield L1.
+    let mut ci = CacheInfo::default();
+    let Ok(out) = std::process::Command::new("wmic")
+        .args(["cpu", "get", "L2CacheSize,L3CacheSize", "/format:value"])
+        .output()
+    else {
+        return ci;
+    };
+    for line in String::from_utf8_lossy(&out.stdout).lines() {
+        let field = line.trim().split_once('=');
+        let Some((key, Ok(kib))) = field.map(|(k, v)| (k, v.trim().parse::<usize>())) else {
+            continue;
+        };
+        match key {
+            "L2CacheSize" => ci.l2 = kib * 1024,
+            "L3CacheSize" => ci.l3 = kib * 1024,
+            _ => {}
+        }
+    }
+    ci
+}
+
+#[cfg(not(any(
+    target_os = "macos",
+    target_os = "ios",
+    target_os = "linux",
+    target_os = "android",
+    target_os = "windows"
+)))]
+fn detect() -> CacheInfo {
+    // No probe for this target (WASM, BSDs, …): every size stays 0 = unknown; callers fall back.
+    CacheInfo::default()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn llc_resolution_priority() {
+        // override wins over everything; contended flag selects the kind.
+        assert_eq!(
+            resolve_llc(Some(8 << 20), false, 1 << 20, 4 << 20, 0),
+            Some((8 << 20, LlcKind::Dedicated))
+        );
+        assert_eq!(
+            resolve_llc(Some(8 << 20), true, 1 << 20, 0, 0),
+            Some((8 << 20, LlcKind::SystemLevel))
+        );
+        // no override: architectural L3 (> L2) is Dedicated.
+        assert_eq!(
+            resolve_llc(None, false, 1 << 20, 4 << 20, 0),
+            Some((4 << 20, LlcKind::Dedicated))
+        );
+        // no L3, but an SLC > L2 is reported: SystemLevel (contended).
+        assert_eq!(
+            resolve_llc(None, false, 512 << 10, 0, 4 << 20),
+            Some((4 << 20, LlcKind::SystemLevel))
+        );
+        // nothing larger than L2 known ⇒ no outer tier (regression-safe).
+        assert_eq!(resolve_llc(None, false, 1 << 20, 0, 0), None);
+        assert_eq!(resolve_llc(None, false, 1 << 20, 1 << 20, 512 << 10), None);
+        // a zero/garbage override is ignored, falling through to detection.
+        assert_eq!(
+            resolve_llc(Some(0), false, 1 << 20, 4 << 20, 0),
+            Some((4 << 20, LlcKind::Dedicated))
+        );
+    }
+
+    #[test]
+    fn slc_probe_never_panics() {
+        // On the test host this is typically 0 (no devicetree SLC); just exercise it.
+        let _ = system_level_cache_bytes();
+        let _ = last_level_cache();
+    }
+
+    #[test]
+    fn parse_cache_size_units() {
+        assert_eq!(parse_cache_size("512"), 512);
+        assert_eq!(parse_cache_size("256K"), 256 * 1024);
+        assert_eq!(parse_cache_size("8M"), 8 * 1024 * 1024);
+        assert_eq!(parse_cache_size(" 1024k "), 1024 * 1024);
+        assert_eq!(parse_cache_size("garbage"), 0);
+    }
+
+    #[test]
+    fn defaults_are_nonzero() {
+        let unknown = CacheInfo::default();
+        assert!(unknown.l1_data_or_default() >= 32 * 1024);
+        assert_eq!(unknown.l2_or_default(), 256 * 1024);
+    }
+
+    #[test]
+    fn detected_values_are_sane_when_present() {
+        // Detection must never panic and must be self-consistent: any level it
+        // *does* report must lie in 1 KiB..=512 MiB, and adjacent known levels
+        // must be ordered (L1 <= L2, L2 <= L3).
+        let ci = cache_info();
+        for (name, v) in [("l1d", ci.l1_data), ("l2", ci.l2), ("l3", ci.l3)] {
+            assert!(v == 0 || (1024..=512 * 1024 * 1024).contains(&v), "{name} implausible: {v}");
+        }
+        if ci.l1_data > 0 && ci.l2 > 0 {
+            assert!(ci.l1_data <= ci.l2, "L1 {} > L2 {}", ci.l1_data, ci.l2);
+        }
+        if ci.l2 > 0 && ci.l3 > 0 {
+            assert!(ci.l2 <= ci.l3, "L2 {} > L3 {}", ci.l2, ci.l3);
+        }
+    }
+}
diff --git a/linalg/src/frame/mmm/mod.rs b/linalg/src/frame/mmm/mod.rs
index bafc69826e..89ac9e9431 100644
--- a/linalg/src/frame/mmm/mod.rs
+++ b/linalg/src/frame/mmm/mod.rs
@@ -279,122 +279,127 @@ unsafe fn run_with_scratch_space_vec<K: MatMatMulKer>(
     }
 }
 
-/// Upper bound on the single-thread panel-block edge (matches the multithread
-/// `chunk_grid` default).
+/// Upper bound on the inner (L2-resident) panel-block edge (matches the
+/// multithread `chunk_grid` default).
 const ST_BLK_MAX: usize = 16;
 
-#[cfg(target_os = "linux")]
-fn parse_cache_size(s: &str) -> usize {
-    let s = s.trim();
-    let (num, mult) = if let Some(n) = s.strip_suffix(['K', 'k']) {
-        (n, 1024)
-    } else if let Some(n) = s.strip_suffix(['M', 'm']) {
-        (n, 1024 * 1024)
+/// Upper bound on the outer (L3-resident) super-block edge. 4× the inner cap so
+/// an L3 several times larger than L2 can hold a meaningfully bigger super-block.
+const ST_BLK_L3_MAX: usize = 64;
+
+/// Panel-block working-set budget (bytes) from a detected cache size: a fraction
+/// `num/den` of the cache (leaving room for the C accumulator tile + packing
+/// metadata), clamped to a sane range. `0` (cache unknown) ⇒ `fallback`, which
+/// is kept small so the block ≈ the naive loop and can never over-block a cache
+/// it can't see. Sizes come from the shared [`crate::cache`] probe.
+fn tier_budget_bytes(cache_bytes: usize, num: usize, den: usize, fallback: usize) -> usize {
+    if cache_bytes == 0 {
+        fallback
     } else {
-        (s, 1)
-    };
-    num.trim().parse::<usize>().unwrap_or(0) * mult
+        (cache_bytes * num / den).clamp(64 * 1024, 64 * 1024 * 1024)
+    }
 }
 
-/// Best-effort L2 data-cache size in bytes (per perf-core / cluster); 0 if
-/// unknown. Cached. Used to size the single-thread cache-block budget so it is
-/// correct across hardware instead of a hard-coded constant.
-fn detect_l2_bytes() -> usize {
-    static L2: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
-    *L2.get_or_init(|| {
-        #[cfg(target_os = "macos")]
-        {
-            let sysctl = |k: &str| -> Option<usize> {
-                let o = std::process::Command::new("sysctl").arg("-n").arg(k).output().ok()?;
-                if !o.status.success() {
-                    return None;
-                }
-                String::from_utf8_lossy(&o.stdout).trim().parse().ok()
-            };
-            // Prefer the performance-core L2 on hybrid Apple Silicon.
-            sysctl("hw.perflevel0.l2cachesize").or_else(|| sysctl("hw.l2cachesize")).unwrap_or(0)
-        }
-        #[cfg(target_os = "linux")]
-        {
-            // index2/index3 is typically the unified L2 (index0/1 are L1 d/i).
-            for idx in [2usize, 3] {
-                if let Ok(s) = std::fs::read_to_string(format!(
-                    "/sys/devices/system/cpu/cpu0/cache/index{idx}/size"
-                )) {
-                    let b = parse_cache_size(s.trim());
-                    if b > 0 {
-                        return b;
-                    }
-                }
-            }
-            0
-        }
-        #[cfg(not(any(target_os = "macos", target_os = "linux")))]
-        {
-            0
-        }
-    })
+/// Inner tier: ~a third of L2 (private per perf-core), 256 KiB fallback.
+fn l2_block_budget_bytes() -> usize {
+    tier_budget_bytes(crate::cache::cache_info().l2, 1, 3, 256 * 1024)
 }
 
-/// Working-set budget (bytes) for the single-thread cache-block: ~a third of L2
-/// (leaving room for the C accumulator tile + packing metadata). Conservative
-/// 256 KiB fallback when L2 is unknown (WASM/Windows/BSD) ⇒ small blocks ≈ the
-/// naive loop, so it can never over-block a cache it can't see.
-fn block_budget_bytes() -> usize {
-    let l2 = detect_l2_bytes();
-    if l2 == 0 { 256 * 1024 } else { (l2 / 3).clamp(64 * 1024, 8 * 1024 * 1024) }
+/// Outer tier: ~half of a dedicated L3 or ~a quarter of a shared System-Level
+/// Cache, as resolved by `crate::cache::last_level_cache` (an LLC no larger than L2
+/// would just duplicate the inner tier). `None` ⇒ no outer tier; single-level walk.
+fn l3_block_budget_bytes() -> Option<usize> {
+    use crate::cache::LlcKind;
+    let (bytes, kind) = crate::cache::last_level_cache()?;
+    // Dedicated cluster L3: ~half. A shared System-Level Cache is contended by the
+    // GPU/NPU/display, so we can't assume residency of lines they keep evicting —
+    // budget it to ~a quarter.
+    let (num, den) = match kind {
+        LlcKind::Dedicated => (1, 2),
+        LlcKind::SystemLevel => (1, 4),
+    };
+    Some(tier_budget_bytes(bytes, num, den, 0)) // fallback unused: a resolved size is never 0
 }
 
-/// Cache-adaptive panel-block edge: large enough to amortise streaming, small
-/// enough that the block's A+B sub-panels (`~blk·(mr+nr)·k·elem_bytes`) stay
-/// L2-resident at the given `k`. Capped at [`ST_BLK_MAX`]; the floor of 1
-/// degrades exactly to the naive loop, so an unknown/small cache can never
-/// over-block (regression-safe). The budget is **cache-size derived** (not a
-/// hard-coded constant), so it is correct across hardware.
+/// Cache-adaptive panel-block edge for a given byte budget: large enough to
+/// amortise streaming, small enough that the block's A+B sub-panels
+/// (`~blk·(mr+nr)·k·elem_bytes`) stay cache-resident at the given `k`. Capped at
+/// `cap`; the floor of 1 degrades exactly to the naive loop, so an unknown/small
+/// cache can never over-block (regression-safe).
 #[inline]
-fn st_block_edge(mr: usize, nr: usize, k: usize, elem_bytes: usize) -> usize {
+fn block_edge_for(
+    budget: usize,
+    mr: usize,
+    nr: usize,
+    k: usize,
+    elem_bytes: usize,
+    cap: usize,
+) -> usize {
     if k == 0 {
-        return ST_BLK_MAX;
+        return cap;
     }
     let per_blk = ((mr + nr) * k * elem_bytes.max(1)).max(1);
-    (block_budget_bytes() / per_blk).clamp(1, ST_BLK_MAX)
+    (budget / per_blk).clamp(1, cap)
 }
 
-/// Single-thread tile walk over the `m_panels × n_panels` grid, blocked into
-/// cache-sized panel blocks for locality (the naive nested loop re-streams the
-/// whole inner operand per outer panel at large k; the multithread path already
-/// blocks this way via `chunk_grid`). `col_outer` selects the within-block inner
-/// order (B-reuse vs A-reuse). Reordering independent tiles changes no result —
-/// bit-exact with the naive loop.
+/// Inner (L2) panel-block edge. Budget is **cache-size derived** (not a
+/// hard-coded constant), so it is correct across hardware.
 #[inline]
-unsafe fn run_single_thread_blocked<K: MatMatMulKer>(
-    ker: &K,
+fn st_block_edge(mr: usize, nr: usize, k: usize, elem_bytes: usize) -> usize {
+    block_edge_for(l2_block_budget_bytes(), mr, nr, k, elem_bytes, ST_BLK_MAX)
+}
+
+/// Outer (L3) super-block edge: the L3-budget edge raised to at least `inner`, or
+/// `usize::MAX` (a single super-block spanning the whole grid, i.e. no outer
+/// tier) when no usable last-level cache budget is available.
+#[inline]
+fn st_outer_block_edge(mr: usize, nr: usize, k: usize, elem_bytes: usize, inner: usize) -> usize {
+    if let Some(budget) = l3_block_budget_bytes() {
+        return block_edge_for(budget, mr, nr, k, elem_bytes, ST_BLK_L3_MAX).max(inner);
+    }
+    usize::MAX
+}
+
+/// Visit every `(ia, ib)` tile of the `m_panels × n_panels` grid exactly once,
+/// blocked two levels deep: an outer `blk_outer` super-block (L3-resident) holds
+/// inner `blk` blocks (L2-resident). `col_outer` selects the within-block inner
+/// order (B-reuse vs A-reuse). When `blk_outer >= max(m,n)` the outer loop runs
+/// once and this is exactly the single-level inner walk. Pure tile reordering ⇒
+/// no result changes; extracted so the nesting can be unit-tested independently
+/// of the kernel.
+#[inline]
+fn for_each_blocked_tile(
     m_panels: usize,
     n_panels: usize,
-    k: usize,
+    blk: usize,
+    blk_outer: usize,
     col_outer: bool,
-    scratch: &mut ScratchSpaceImpl<K::Acc>,
-    non_linear: &[FusedSpec],
+    mut f: impl FnMut(usize, usize) -> TractResult<()>,
 ) -> TractResult<()> {
-    unsafe {
-        let blk = st_block_edge(ker.mr(), ker.nr(), k, K::Acc::datum_type().size_of());
-        scratch.run_in_tls_scope(|scratch, tls| {
-            let mut jb = 0;
-            while jb < n_panels {
-                let jb_end = (jb + blk).min(n_panels);
-                let mut ja = 0;
-                while ja < m_panels {
-                    let ja_end = (ja + blk).min(m_panels);
+    let blk = blk.max(1);
+    let blk_outer = blk_outer.max(blk);
+    let mut jb3 = 0;
+    while jb3 < n_panels {
+        let jb3_end = jb3.saturating_add(blk_outer).min(n_panels);
+        let mut ja3 = 0;
+        while ja3 < m_panels {
+            let ja3_end = ja3.saturating_add(blk_outer).min(m_panels);
+            let mut jb = jb3;
+            while jb < jb3_end {
+                let jb_end = (jb + blk).min(jb3_end);
+                let mut ja = ja3;
+                while ja < ja3_end {
+                    let ja_end = (ja + blk).min(ja3_end);
                     if col_outer {
                         for ib in jb..jb_end {
                             for ia in ja..ja_end {
-                                scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
+                                f(ia, ib)?;
                             }
                         }
                     } else {
                         for ia in ja..ja_end {
                             for ib in jb..jb_end {
-                                scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
+                                f(ia, ib)?;
                             }
                         }
                     }
@@ -402,7 +407,37 @@ unsafe fn run_single_thread_blocked<K: MatMatMulKer>(
                 }
                 jb = jb_end;
             }
-            TractResult::Ok(())
+            ja3 = ja3_end;
+        }
+        jb3 = jb3_end;
+    }
+    Ok(())
+}
+
+/// Single-thread tile walk over the `m_panels × n_panels` grid, blocked into cache-sized
+/// panel blocks for locality (the naive nested loop re-streams the whole inner operand per
+/// outer panel at large k; the multithread path already blocks this way via `chunk_grid`).
+/// Two tiers: an inner L2-resident block and, where an L3 is detected, an outer L3-resident
+/// super-block. Reordering independent tiles changes no result — bit-exact with the naive loop.
+/// NOTE(review): `unsafe` contract not stated here — presumably that of `run_one_tile`; confirm.
+#[inline]
+unsafe fn run_single_thread_blocked<K: MatMatMulKer>(
+    ker: &K,
+    m_panels: usize,
+    n_panels: usize,
+    k: usize,
+    col_outer: bool,
+    scratch: &mut ScratchSpaceImpl<K::Acc>,
+    non_linear: &[FusedSpec],
+) -> TractResult<()> {
+    unsafe {
+        let elem = K::Acc::datum_type().size_of();
+        let blk = st_block_edge(ker.mr(), ker.nr(), k, elem);
+        let blk_outer = st_outer_block_edge(ker.mr(), ker.nr(), k, elem, blk);
+        scratch.run_in_tls_scope(|scratch, tls| {
+            for_each_blocked_tile(m_panels, n_panels, blk, blk_outer, col_outer, |ia, ib| {
+                scratch.run_one_tile(ker, non_linear, tls, ia, ib)
+            })
         })
     }
 }
@@ -600,3 +635,93 @@ where
     };
     if use_global { body() } else { pool.unwrap().install(body) }
 }
+
+#[cfg(test)]
+mod blocked_walk_tests {
+    use super::*;
+    use std::collections::HashSet;
+
+    fn collect(
+        m: usize,
+        n: usize,
+        blk: usize,
+        blk_outer: usize,
+        col_outer: bool,
+    ) -> Vec<(usize, usize)> {
+        let mut v = Vec::new();
+        for_each_blocked_tile(m, n, blk, blk_outer, col_outer, |ia, ib| {
+            v.push((ia, ib));
+            Ok(())
+        })
+        .unwrap();
+        v
+    }
+
+    /// Every grid tile is visited exactly once, for both inner orders and a
+    /// range of (blk, blk_outer) — single-tier (outer = MAX), two-tier, and
+    /// degenerate edges. Coverage being a permutation is what makes the walk
+    /// bit-exact with the naive loop.
+    #[test]
+    fn covers_every_tile_once() {
+        for &(m, n) in &[(1, 1), (3, 5), (16, 16), (40, 7), (7, 40), (80, 80)] {
+            for &blk in &[1, 3, 16] {
+                for &blk_outer in &[blk, blk + 1, 64, usize::MAX] {
+                    for &col_outer in &[false, true] {
+                        let tiles = collect(m, n, blk, blk_outer, col_outer);
+                        assert_eq!(tiles.len(), m * n, "m={m} n={n} blk={blk} outer={blk_outer}");
+                        let set: HashSet<_> = tiles.iter().copied().collect();
+                        assert_eq!(
+                            set.len(),
+                            m * n,
+                            "duplicate tiles m={m} n={n} blk={blk} outer={blk_outer}"
+                        );
+                        for ia in 0..m {
+                            for ib in 0..n {
+                                assert!(set.contains(&(ia, ib)), "missing ({ia},{ib})");
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// With no outer tier (blk_outer = MAX) the two-tier walk must emit the exact
+    /// same order as the original single-level blocked loop — guarantees the L3
+    /// path is a pure no-op on hardware without a detectable L3.
+    #[test]
+    fn outer_max_matches_single_level() {
+        for &(m, n) in &[(40, 7), (80, 80), (13, 29)] {
+            for &blk in &[1, 4, 16] {
+                for &col_outer in &[false, true] {
+                    let two_tier = collect(m, n, blk, usize::MAX, col_outer);
+                    let mut single = Vec::new(); // reference: the single-level blocked walk
+                    let mut jb = 0;
+                    while jb < n {
+                        let jb_end = (jb + blk).min(n);
+                        let mut ja = 0;
+                        while ja < m {
+                            let ja_end = (ja + blk).min(m);
+                            if col_outer {
+                                for ib in jb..jb_end {
+                                    for ia in ja..ja_end {
+                                        single.push((ia, ib));
+                                    }
+                                }
+                            } else {
+                                for ia in ja..ja_end {
+                                    for ib in jb..jb_end {
+                                        single.push((ia, ib));
+                                    }
+                                }
+                            }
+                            ja = ja_end;
+                        }
+                        jb = jb_end;
+                    }
+                    assert_eq!(two_tier, single, "m={m} n={n} blk={blk} col_outer={col_outer}");
+                }
+            }
+        }
+    }
+}
diff --git a/linalg/src/lib.rs b/linalg/src/lib.rs
index f64488bd8e..40fe3f1f2a 100644
--- a/linalg/src/lib.rs
+++ b/linalg/src/lib.rs
@@ -20,6 +20,7 @@ include!(concat!(env!("OUT_DIR"), "/extern_kernel_macro.rs"));
 
 #[macro_use]
 mod frame;
+pub mod cache;
 pub mod generic;
 pub mod multithread;
 pub use frame::weights::WeightType;