diff --git a/linalg/src/cache.rs b/linalg/src/cache.rs new file mode 100644 index 0000000000..01a1d33925 --- /dev/null +++ b/linalg/src/cache.rs @@ -0,0 +1,378 @@ +//! Best-effort runtime CPU data-cache geometry detection. +//! +//! Cache blocking (panel-block sizing in `mmm`, im2col lowering thresholds, …) +//! is only correct when the block budget is derived from the *actual* cache the +//! code runs on, not a hard-coded constant. This module centralises that +//! detection so every heuristic reads the same memoised numbers instead of each +//! re-implementing a platform probe. +//! +//! All sizes are **bytes**, with `0` meaning "could not detect on this platform" +//! — callers must treat `0` as unknown and fall back conservatively (never +//! over-block a cache you cannot see). The raw fields stay honest; the +//! `*_or_default` helpers apply an architecture-based guess for callers that +//! prefer a number to a zero. +//! +//! Detection is done once, lazily, and cached for the process lifetime. + +use std::sync::OnceLock; + +/// Detected data-cache sizes in bytes. `0` == unknown on this platform. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct CacheInfo { + /// L1 data cache (per core), bytes. `0` if unknown. + pub l1_data: usize, + /// L2 cache (per perf-core / cluster), bytes. `0` if unknown. + pub l2: usize, + /// L3 / last-level cache, bytes. `0` if unknown. + pub l3: usize, +} + +impl CacheInfo { + /// L1 data cache, or an architecture-based guess when undetected + /// (64 KiB on arm64, 32 KiB elsewhere — matches common silicon). + pub fn l1_data_or_default(&self) -> usize { + if self.l1_data > 0 { + self.l1_data + } else if cfg!(target_arch = "aarch64") { + 64 * 1024 + } else { + 32 * 1024 + } + } + + /// L2 cache, or a conservative 256 KiB guess when undetected. + pub fn l2_or_default(&self) -> usize { + if self.l2 > 0 { self.l2 } else { 256 * 1024 } + } +} + +/// Memoised cache geometry for the current machine. Detected once on first call. +pub fn cache_info() -> CacheInfo { + static CACHE: OnceLock = OnceLock::new(); + *CACHE.get_or_init(detect) +} + +/// Where the last-level cache used for the outer GEMM blocking tier comes from, +/// which implies how aggressively a single thread may budget it. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LlcKind { + /// Architectural cluster L3 (or an operator-provided size) — effectively + /// private to the CPU, so a single thread can assume most of it. + Dedicated, + /// System-Level Cache: an interconnect cache shared with the GPU/NPU/display + /// (e.g. Qualcomm LLCC, Apple SLC). Contended — budget it conservatively. + SystemLevel, +} + +/// Size (bytes) and kind of the last-level cache to size the outer GEMM blocking +/// tier against, or `None` when nothing usefully larger than L2 is known. +/// +/// Resolution order (first hit wins): +/// 1. `TRACT_LLC_BYTES` env override (e.g. `"8M"`, `"33554432"`) — for embedders +/// who know their SoC's LLC/SLC when the OS doesn't expose it. Marked +/// [`LlcKind::SystemLevel`] iff `TRACT_LLC_CONTENDED` is set, else `Dedicated`. +/// 2. architecturally-detected L3 ([`CacheInfo::l3`]) when it exceeds L2 — `Dedicated`. +/// 3. a System-Level Cache discovered via the Linux devicetree (`cache-level == 3` +/// with a `cache-size`, outside `/cpus`) — `SystemLevel`. +/// +/// The per-CPU `cpu/cache/index*` topology the L3 probe reads does **not** list an +/// SLC (it's a separate interconnect IP), which is why an SLC needs a distinct +/// source. Prior art — runtime cache sizing: Eigen `queryCacheSizes` (CPUID/sysctl), +/// glibc `sysconf(_SC_LEVELx_CACHE_SIZE)`, ACPI PPTT, hwloc. SLC exposure: Qualcomm +/// LLCC (`drivers/soc/qcom/llcc-qcom.c`, devicetree `qcom,llcc`) and the generic +/// devicetree cache bindings. +pub fn last_level_cache() -> Option<(usize, LlcKind)> { + let ci = cache_info(); + resolve_llc( + env_llc_override(), + std::env::var_os("TRACT_LLC_CONTENDED").is_some(), + ci.l2, + ci.l3, + system_level_cache_bytes(), + ) +} + +/// Pure resolution of [`last_level_cache`] (factored out so it is testable without +/// touching process-global env / hardware). +fn resolve_llc( + override_bytes: Option, + override_contended: bool, + l2: usize, + l3: usize, + slc: usize, +) -> Option<(usize, LlcKind)> { + if let Some(b) = override_bytes.filter(|b| *b > 0) { + let kind = if override_contended { LlcKind::SystemLevel } else { LlcKind::Dedicated }; + return Some((b, kind)); + } + if l3 > l2 { + return Some((l3, LlcKind::Dedicated)); + } + if slc > l2 && slc > 0 { + return Some((slc, LlcKind::SystemLevel)); + } + None +} + +fn env_llc_override() -> Option { + let b = parse_cache_size(&std::env::var("TRACT_LLC_BYTES").ok()?); + (b > 0).then_some(b) +} + +/// Best-effort System-Level Cache size (bytes) from the Linux devicetree: the +/// largest node carrying `cache-level == 3` *and* a `cache-size`, outside the +/// `/cpus` subtree (so it is an interconnect cache, not a CPU cache the L3 probe +/// already saw). Returns `0` when unavailable — e.g. SLCs whose size is fixed in +/// the controller (Qualcomm LLCC) carry no `cache-size` here, so those still rely +/// on the `TRACT_LLC_BYTES` override. +#[cfg(any(target_os = "linux", target_os = "android"))] +fn system_level_cache_bytes() -> usize { + use std::path::Path; + fn be_u32(p: &Path) -> Option { + let b = std::fs::read(p).ok()?; + (b.len() >= 4).then(|| u32::from_be_bytes([b[0], b[1], b[2], b[3]])) + } + fn walk(dir: &Path, depth: usize, best: &mut usize) { + if depth == 0 { + return; + } + if be_u32(&dir.join("cache-level")) == Some(3) { + let sz = be_u32(&dir.join("cache-size")).unwrap_or(0) as usize; + *best = (*best).max(sz); + } + let Ok(rd) = std::fs::read_dir(dir) else { return }; + for e in rd.flatten() { + let p = e.path(); + // CPU caches are handled by the architectural L3 probe; skip them. + if p.is_dir() && p.file_name().and_then(|n| n.to_str()) != Some("cpus") { + walk(&p, depth - 1, best); + } + } + } + let mut best = 0; + for root in ["/proc/device-tree", "/sys/firmware/devicetree/base"] { + let p = Path::new(root); + if p.exists() { + walk(p, 4, &mut best); + if best > 0 { + break; + } + } + } + best +} + +#[cfg(not(any(target_os = "linux", target_os = "android")))] +fn system_level_cache_bytes() -> usize { + 0 +} + +/// Parse a Linux `/sys` cache `size` string (e.g. `"256K"`, `"8M"`, `"512"`). +#[cfg_attr(not(any(target_os = "linux", target_os = "android")), allow(dead_code))] +fn parse_cache_size(s: &str) -> usize { + let s = s.trim(); + let (num, mult) = if let Some(n) = s.strip_suffix(['K', 'k']) { + (n, 1024) + } else if let Some(n) = s.strip_suffix(['M', 'm']) { + (n, 1024 * 1024) + } else { + (s, 1) + }; + num.trim().parse::().unwrap_or(0) * mult +} + +#[cfg(any(target_os = "macos", target_os = "ios"))] +fn detect() -> CacheInfo { + // Read a scalar `hw.*` sysctl by name via the libc FFI (no subprocess). + // macOS returns these as a little-endian integer (4 or 8 bytes); a zeroed + // 8-byte buffer reads either width correctly on little-endian Apple silicon + // and Intel. + fn sysctl_usize(name: &str) -> Option { + use std::ffi::CString; + use std::os::raw::{c_char, c_int, c_void}; + unsafe extern "C" { + fn sysctlbyname( + name: *const c_char, + oldp: *mut c_void, + oldlenp: *mut usize, + newp: *mut c_void, + newlen: usize, + ) -> c_int; + } + let cname = CString::new(name).ok()?; + let mut val: u64 = 0; + let mut len = std::mem::size_of::(); + let rc = unsafe { + sysctlbyname( + cname.as_ptr(), + &mut val as *mut u64 as *mut c_void, + &mut len, + std::ptr::null_mut(), + 0, + ) + }; + if rc != 0 || val == 0 { None } else { Some(val as usize) } + } + + CacheInfo { + // perflevel0 is the performance cluster on hybrid Apple Silicon. + l1_data: sysctl_usize("hw.perflevel0.l1dcachesize") + .or_else(|| sysctl_usize("hw.l1dcachesize")) + .unwrap_or(0), + l2: sysctl_usize("hw.perflevel0.l2cachesize") + .or_else(|| sysctl_usize("hw.l2cachesize")) + .unwrap_or(0), + l3: sysctl_usize("hw.perflevel0.l3cachesize") + .or_else(|| sysctl_usize("hw.l3cachesize")) + .unwrap_or(0), + } +} + +#[cfg(any(target_os = "linux", target_os = "android"))] +fn detect() -> CacheInfo { + // Walk /sys/.../cache/indexN, keying off the reported level+type rather than + // assuming a fixed index layout (it varies: SMT, unified vs split L2, …). + let mut ci = CacheInfo::default(); + for idx in 0..16 { + let base = format!("/sys/devices/system/cpu/cpu0/cache/index{idx}/"); + let Ok(level) = std::fs::read_to_string(format!("{base}level")) else { + continue; + }; + let level: usize = level.trim().parse().unwrap_or(0); + let ctype = std::fs::read_to_string(format!("{base}type")) + .unwrap_or_default() + .trim() + .to_ascii_lowercase(); + let size = std::fs::read_to_string(format!("{base}size")) + .map(|s| parse_cache_size(&s)) + .unwrap_or(0); + if size == 0 { + continue; + } + match level { + 1 if ctype == "data" || ctype == "unified" => { + if ci.l1_data == 0 { + ci.l1_data = size; + } + } + 2 if ci.l2 == 0 => ci.l2 = size, + 3 if ci.l3 == 0 => ci.l3 = size, + _ => {} + } + } + ci +} + +#[cfg(target_os = "windows")] +fn detect() -> CacheInfo { + // wmic only reports L2/L3 (in KiB) and is deprecated on Win11; it is the + // dependency-free option. L1 is left unknown (→ l1_data_or_default). + // A future GetLogicalProcessorInformationEx probe would also yield L1. + let mut ci = CacheInfo::default(); + if let Ok(out) = std::process::Command::new("wmic") + .args(["cpu", "get", "L2CacheSize,L3CacheSize", "/format:value"]) + .output() + { + for line in String::from_utf8_lossy(&out.stdout).lines() { + let line = line.trim(); + if let Some(v) = line.strip_prefix("L2CacheSize=") { + if let Ok(kb) = v.trim().parse::() { + ci.l2 = kb * 1024; + } + } else if let Some(v) = line.strip_prefix("L3CacheSize=") { + if let Ok(kb) = v.trim().parse::() { + ci.l3 = kb * 1024; + } + } + } + } + ci +} + +#[cfg(not(any( + target_os = "macos", + target_os = "ios", + target_os = "linux", + target_os = "android", + target_os = "windows" +)))] +fn detect() -> CacheInfo { + // WASM, BSDs, etc.: no portable probe — report unknown, callers fall back. + CacheInfo::default() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn llc_resolution_priority() { + // override wins over everything; contended flag selects the kind. + assert_eq!( + resolve_llc(Some(8 << 20), false, 1 << 20, 4 << 20, 0), + Some((8 << 20, LlcKind::Dedicated)) + ); + assert_eq!( + resolve_llc(Some(8 << 20), true, 1 << 20, 0, 0), + Some((8 << 20, LlcKind::SystemLevel)) + ); + // no override: architectural L3 (> L2) is Dedicated. + assert_eq!( + resolve_llc(None, false, 1 << 20, 4 << 20, 0), + Some((4 << 20, LlcKind::Dedicated)) + ); + // no L3, but an SLC > L2 is reported: SystemLevel (contended). + assert_eq!( + resolve_llc(None, false, 512 << 10, 0, 4 << 20), + Some((4 << 20, LlcKind::SystemLevel)) + ); + // nothing larger than L2 known ⇒ no outer tier (regression-safe). + assert_eq!(resolve_llc(None, false, 1 << 20, 0, 0), None); + assert_eq!(resolve_llc(None, false, 1 << 20, 1 << 20, 512 << 10), None); + // a zero/garbage override is ignored, falling through to detection. + assert_eq!( + resolve_llc(Some(0), false, 1 << 20, 4 << 20, 0), + Some((4 << 20, LlcKind::Dedicated)) + ); + } + + #[test] + fn slc_probe_never_panics() { + // On the test host this is typically 0 (no devicetree SLC); just exercise it. + let _ = system_level_cache_bytes(); + let _ = last_level_cache(); + } + + #[test] + fn parse_cache_size_units() { + assert_eq!(parse_cache_size("512"), 512); + assert_eq!(parse_cache_size("256K"), 256 * 1024); + assert_eq!(parse_cache_size("8M"), 8 * 1024 * 1024); + assert_eq!(parse_cache_size(" 1024k "), 1024 * 1024); + assert_eq!(parse_cache_size("garbage"), 0); + } + + #[test] + fn defaults_are_nonzero() { + let unknown = CacheInfo::default(); + assert!(unknown.l1_data_or_default() >= 32 * 1024); + assert_eq!(unknown.l2_or_default(), 256 * 1024); + } + + #[test] + fn detected_values_are_sane_when_present() { + // Detection must never panic and must be self-consistent: any level it + // *does* report should be a plausible power-of-two-ish cache size, and + // L1 <= L2 <= L3 when all are known. + let ci = cache_info(); + for (name, v) in [("l1d", ci.l1_data), ("l2", ci.l2), ("l3", ci.l3)] { + assert!(v == 0 || (1024..=512 * 1024 * 1024).contains(&v), "{name} implausible: {v}"); + } + if ci.l1_data > 0 && ci.l2 > 0 { + assert!(ci.l1_data <= ci.l2, "L1 {} > L2 {}", ci.l1_data, ci.l2); + } + if ci.l2 > 0 && ci.l3 > 0 { + assert!(ci.l2 <= ci.l3, "L2 {} > L3 {}", ci.l2, ci.l3); + } + } +} diff --git a/linalg/src/frame/mmm/mod.rs b/linalg/src/frame/mmm/mod.rs index bafc69826e..89ac9e9431 100644 --- a/linalg/src/frame/mmm/mod.rs +++ b/linalg/src/frame/mmm/mod.rs @@ -279,122 +279,127 @@ unsafe fn run_with_scratch_space_vec( } } -/// Upper bound on the single-thread panel-block edge (matches the multithread -/// `chunk_grid` default). +/// Upper bound on the inner (L2-resident) panel-block edge (matches the +/// multithread `chunk_grid` default). const ST_BLK_MAX: usize = 16; -#[cfg(target_os = "linux")] -fn parse_cache_size(s: &str) -> usize { - let s = s.trim(); - let (num, mult) = if let Some(n) = s.strip_suffix(['K', 'k']) { - (n, 1024) - } else if let Some(n) = s.strip_suffix(['M', 'm']) { - (n, 1024 * 1024) +/// Upper bound on the outer (L3-resident) super-block edge. 4× the inner cap so +/// an L3 several times larger than L2 can hold a meaningfully bigger super-block. +const ST_BLK_L3_MAX: usize = 64; + +/// Panel-block working-set budget (bytes) from a detected cache size: a fraction +/// `num/den` of the cache (leaving room for the C accumulator tile + packing +/// metadata), clamped to a sane range. `0` (cache unknown) ⇒ `fallback`, which +/// is kept small so the block ≈ the naive loop and can never over-block a cache +/// it can't see. Sizes come from the shared [`crate::cache`] probe. +fn tier_budget_bytes(cache_bytes: usize, num: usize, den: usize, fallback: usize) -> usize { + if cache_bytes == 0 { + fallback } else { - (s, 1) - }; - num.trim().parse::().unwrap_or(0) * mult + (cache_bytes * num / den).clamp(64 * 1024, 64 * 1024 * 1024) + } } -/// Best-effort L2 data-cache size in bytes (per perf-core / cluster); 0 if -/// unknown. Cached. Used to size the single-thread cache-block budget so it is -/// correct across hardware instead of a hard-coded constant. -fn detect_l2_bytes() -> usize { - static L2: std::sync::OnceLock = std::sync::OnceLock::new(); - *L2.get_or_init(|| { - #[cfg(target_os = "macos")] - { - let sysctl = |k: &str| -> Option { - let o = std::process::Command::new("sysctl").arg("-n").arg(k).output().ok()?; - if !o.status.success() { - return None; - } - String::from_utf8_lossy(&o.stdout).trim().parse().ok() - }; - // Prefer the performance-core L2 on hybrid Apple Silicon. - sysctl("hw.perflevel0.l2cachesize").or_else(|| sysctl("hw.l2cachesize")).unwrap_or(0) - } - #[cfg(target_os = "linux")] - { - // index2/index3 is typically the unified L2 (index0/1 are L1 d/i). - for idx in [2usize, 3] { - if let Ok(s) = std::fs::read_to_string(format!( - "/sys/devices/system/cpu/cpu0/cache/index{idx}/size" - )) { - let b = parse_cache_size(s.trim()); - if b > 0 { - return b; - } - } - } - 0 - } - #[cfg(not(any(target_os = "macos", target_os = "linux")))] - { - 0 - } - }) +/// Inner tier: ~a third of L2 (private per perf-core), 256 KiB fallback. +fn l2_block_budget_bytes() -> usize { + tier_budget_bytes(crate::cache::cache_info().l2, 1, 3, 256 * 1024) } -/// Working-set budget (bytes) for the single-thread cache-block: ~a third of L2 -/// (leaving room for the C accumulator tile + packing metadata). Conservative -/// 256 KiB fallback when L2 is unknown (WASM/Windows/BSD) ⇒ small blocks ≈ the -/// naive loop, so it can never over-block a cache it can't see. -fn block_budget_bytes() -> usize { - let l2 = detect_l2_bytes(); - if l2 == 0 { 256 * 1024 } else { (l2 / 3).clamp(64 * 1024, 8 * 1024 * 1024) } +/// Outer tier: ~half of L3, but only when an L3 is detected and is meaningfully +/// larger than L2 (otherwise an outer tier just duplicates the inner one). +/// `None` ⇒ no outer tier; the walk stays single-level (identical to before). +fn l3_block_budget_bytes() -> Option { + use crate::cache::LlcKind; + let (bytes, kind) = crate::cache::last_level_cache()?; + // Dedicated cluster L3: ~half. A shared System-Level Cache is contended by the + // GPU/NPU/display, so we can't assume residency of lines they keep evicting — + // budget it to ~a quarter. + let (num, den) = match kind { + LlcKind::Dedicated => (1, 2), + LlcKind::SystemLevel => (1, 4), + }; + Some(tier_budget_bytes(bytes, num, den, 0)) } -/// Cache-adaptive panel-block edge: large enough to amortise streaming, small -/// enough that the block's A+B sub-panels (`~blk·(mr+nr)·k·elem_bytes`) stay -/// L2-resident at the given `k`. Capped at [`ST_BLK_MAX`]; the floor of 1 -/// degrades exactly to the naive loop, so an unknown/small cache can never -/// over-block (regression-safe). The budget is **cache-size derived** (not a -/// hard-coded constant), so it is correct across hardware. +/// Cache-adaptive panel-block edge for a given byte budget: large enough to +/// amortise streaming, small enough that the block's A+B sub-panels +/// (`~blk·(mr+nr)·k·elem_bytes`) stay cache-resident at the given `k`. Capped at +/// `cap`; the floor of 1 degrades exactly to the naive loop, so an unknown/small +/// cache can never over-block (regression-safe). #[inline] -fn st_block_edge(mr: usize, nr: usize, k: usize, elem_bytes: usize) -> usize { +fn block_edge_for( + budget: usize, + mr: usize, + nr: usize, + k: usize, + elem_bytes: usize, + cap: usize, +) -> usize { if k == 0 { - return ST_BLK_MAX; + return cap; } let per_blk = ((mr + nr) * k * elem_bytes.max(1)).max(1); - (block_budget_bytes() / per_blk).clamp(1, ST_BLK_MAX) + (budget / per_blk).clamp(1, cap) } -/// Single-thread tile walk over the `m_panels × n_panels` grid, blocked into -/// cache-sized panel blocks for locality (the naive nested loop re-streams the -/// whole inner operand per outer panel at large k; the multithread path already -/// blocks this way via `chunk_grid`). `col_outer` selects the within-block inner -/// order (B-reuse vs A-reuse). Reordering independent tiles changes no result — -/// bit-exact with the naive loop. +/// Inner (L2) panel-block edge. Budget is **cache-size derived** (not a +/// hard-coded constant), so it is correct across hardware. #[inline] -unsafe fn run_single_thread_blocked( - ker: &K, +fn st_block_edge(mr: usize, nr: usize, k: usize, elem_bytes: usize) -> usize { + block_edge_for(l2_block_budget_bytes(), mr, nr, k, elem_bytes, ST_BLK_MAX) +} + +/// Outer (L3) super-block edge, or `usize::MAX` (one block over the whole grid, +/// i.e. no outer tier) when no usable L3 is detected. Never smaller than the +/// inner edge `inner`. +#[inline] +fn st_outer_block_edge(mr: usize, nr: usize, k: usize, elem_bytes: usize, inner: usize) -> usize { + match l3_block_budget_bytes() { + Some(budget) => block_edge_for(budget, mr, nr, k, elem_bytes, ST_BLK_L3_MAX).max(inner), + None => usize::MAX, + } +} + +/// Visit every `(ia, ib)` tile of the `m_panels × n_panels` grid exactly once, +/// blocked two levels deep: an outer `blk_outer` super-block (L3-resident) holds +/// inner `blk` blocks (L2-resident). `col_outer` selects the within-block inner +/// order (B-reuse vs A-reuse). When `blk_outer >= max(m,n)` the outer loop runs +/// once and this is exactly the single-level inner walk. Pure tile reordering ⇒ +/// no result changes; extracted so the nesting can be unit-tested independently +/// of the kernel. +#[inline] +fn for_each_blocked_tile( m_panels: usize, n_panels: usize, - k: usize, + blk: usize, + blk_outer: usize, col_outer: bool, - scratch: &mut ScratchSpaceImpl, - non_linear: &[FusedSpec], + mut f: impl FnMut(usize, usize) -> TractResult<()>, ) -> TractResult<()> { - unsafe { - let blk = st_block_edge(ker.mr(), ker.nr(), k, K::Acc::datum_type().size_of()); - scratch.run_in_tls_scope(|scratch, tls| { - let mut jb = 0; - while jb < n_panels { - let jb_end = (jb + blk).min(n_panels); - let mut ja = 0; - while ja < m_panels { - let ja_end = (ja + blk).min(m_panels); + let blk = blk.max(1); + let blk_outer = blk_outer.max(blk); + let mut jb3 = 0; + while jb3 < n_panels { + let jb3_end = jb3.saturating_add(blk_outer).min(n_panels); + let mut ja3 = 0; + while ja3 < m_panels { + let ja3_end = ja3.saturating_add(blk_outer).min(m_panels); + let mut jb = jb3; + while jb < jb3_end { + let jb_end = (jb + blk).min(jb3_end); + let mut ja = ja3; + while ja < ja3_end { + let ja_end = (ja + blk).min(ja3_end); if col_outer { for ib in jb..jb_end { for ia in ja..ja_end { - scratch.run_one_tile(ker, non_linear, tls, ia, ib)?; + f(ia, ib)?; } } } else { for ia in ja..ja_end { for ib in jb..jb_end { - scratch.run_one_tile(ker, non_linear, tls, ia, ib)?; + f(ia, ib)?; } } } @@ -402,7 +407,37 @@ unsafe fn run_single_thread_blocked( } jb = jb_end; } - TractResult::Ok(()) + ja3 = ja3_end; + } + jb3 = jb3_end; + } + Ok(()) +} + +/// Single-thread tile walk over the `m_panels × n_panels` grid, blocked into +/// cache-sized panel blocks for locality (the naive nested loop re-streams the +/// whole inner operand per outer panel at large k; the multithread path already +/// blocks this way via `chunk_grid`). Two tiers: an inner L2-resident block and, +/// where an L3 is detected, an outer L3-resident super-block. Reordering +/// independent tiles changes no result — bit-exact with the naive loop. +#[inline] +unsafe fn run_single_thread_blocked( + ker: &K, + m_panels: usize, + n_panels: usize, + k: usize, + col_outer: bool, + scratch: &mut ScratchSpaceImpl, + non_linear: &[FusedSpec], +) -> TractResult<()> { + unsafe { + let elem = K::Acc::datum_type().size_of(); + let blk = st_block_edge(ker.mr(), ker.nr(), k, elem); + let blk_outer = st_outer_block_edge(ker.mr(), ker.nr(), k, elem, blk); + scratch.run_in_tls_scope(|scratch, tls| { + for_each_blocked_tile(m_panels, n_panels, blk, blk_outer, col_outer, |ia, ib| { + scratch.run_one_tile(ker, non_linear, tls, ia, ib) + }) }) } } @@ -600,3 +635,93 @@ where }; if use_global { body() } else { pool.unwrap().install(body) } } + +#[cfg(test)] +mod blocked_walk_tests { + use super::*; + use std::collections::HashSet; + + fn collect( + m: usize, + n: usize, + blk: usize, + blk_outer: usize, + col_outer: bool, + ) -> Vec<(usize, usize)> { + let mut v = Vec::new(); + for_each_blocked_tile(m, n, blk, blk_outer, col_outer, |ia, ib| { + v.push((ia, ib)); + Ok(()) + }) + .unwrap(); + v + } + + /// Every grid tile is visited exactly once, for both inner orders and a + /// range of (blk, blk_outer) — single-tier (outer = MAX), two-tier, and + /// degenerate edges. Coverage being a permutation is what makes the walk + /// bit-exact with the naive loop. + #[test] + fn covers_every_tile_once() { + for &(m, n) in &[(1, 1), (3, 5), (16, 16), (40, 7), (7, 40), (80, 80)] { + for &blk in &[1, 3, 16] { + for &blk_outer in &[blk, blk + 1, 64, usize::MAX] { + for &col_outer in &[false, true] { + let tiles = collect(m, n, blk, blk_outer, col_outer); + assert_eq!(tiles.len(), m * n, "m={m} n={n} blk={blk} outer={blk_outer}"); + let set: HashSet<_> = tiles.iter().copied().collect(); + assert_eq!( + set.len(), + m * n, + "duplicate tiles m={m} n={n} blk={blk} outer={blk_outer}" + ); + for ia in 0..m { + for ib in 0..n { + assert!(set.contains(&(ia, ib)), "missing ({ia},{ib})"); + } + } + } + } + } + } + } + + /// With no outer tier (blk_outer = MAX) the two-tier walk must emit the exact + /// same order as the original single-level blocked loop — guarantees the L3 + /// path is a pure no-op on hardware without a detectable L3. + #[test] + fn outer_max_matches_single_level() { + for &(m, n) in &[(40, 7), (80, 80), (13, 29)] { + for &blk in &[1, 4, 16] { + for &col_outer in &[false, true] { + let two_tier = collect(m, n, blk, usize::MAX, col_outer); + let mut single = Vec::new(); + let mut jb = 0; + while jb < n { + let jb_end = (jb + blk).min(n); + let mut ja = 0; + while ja < m { + let ja_end = (ja + blk).min(m); + if col_outer { + for ib in jb..jb_end { + for ia in ja..ja_end { + single.push((ia, ib)); + } + } + } else { + for ia in ja..ja_end { + for ib in jb..jb_end { + single.push((ia, ib)); + } + } + } + ja = ja_end; + } + jb = jb_end; + } + assert_eq!(two_tier, single, "m={m} n={n} blk={blk} col_outer={col_outer}"); + } + } + } + } +} diff --git a/linalg/src/lib.rs b/linalg/src/lib.rs index f64488bd8e..40fe3f1f2a 100644 --- a/linalg/src/lib.rs +++ b/linalg/src/lib.rs @@ -20,6 +20,7 @@ include!(concat!(env!("OUT_DIR"), "/extern_kernel_macro.rs")); #[macro_use] mod frame; +pub mod cache; pub mod generic; pub mod multithread; pub use frame::weights::WeightType;