From b4cbf8970fb76479df3b4538f2cfce2a6efc36b4 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Thu, 28 May 2026 00:01:32 -0400 Subject: [PATCH 01/14] feat: disk-backed fallback for MessageStore and PmkidStore Add automatic memory pressure detection via MemMonitor (80% RSS threshold). When triggered, MessageStore and PmkidStore flush their in-memory data to temp files and switch to disk-backed mode for the remainder of the run. MessageStore disk mode: - Binary serialization format for EapolMessage (99-byte fixed header + optional 57-byte FtFields + variable eapol_frame) - flush_to_disk() serializes all groups, replaces Vec<EapolMessage> with Vec<MessageRef> (8 bytes per message vs ~228) - New messages route directly to disk via add_to_disk() - group_keys() + load_group() for lazy Phase 4 iteration - canonicalize_pairs() rewrites index keys without loading data PmkidStore disk mode: - Same pattern: binary serialization, flush, disk-backed add/iter - iter() returns owned PmkidEntry values (Box<dyn Iterator>) Pairing engine (pair/mod.rs): - Disk mode: single-threaded iteration via group_keys() + load_group() - Memory mode: unchanged rayon parallel path - estimate_total_cost() returns 0 in disk mode (skip dedup pre-sizing) Output pipeline: - PerSinkDedup::reserve() now takes active_sinks mask, only pre-sizes HashSets for configured sinks (fixes 9x over-allocation bug) Main integration: - MemMonitor checks every 50K packets + every file transition - Flush both stores when disk_mode activates - Cleanup temp files on shutdown --- src/lib.rs | 1 + src/main.rs | 48 +++-- src/mem_monitor.rs | 189 +++++++++++++++++ src/output/dedup.rs | 17 +- src/output/mod.rs | 44 ++-- src/pair/mod.rs | 59 +++++- src/store/disk_messages.rs | 420 +++++++++++++++++++++++++++++++++++++ src/store/messages.rs | 224 ++++++++++++++++++-- src/store/mod.rs | 1 + src/store/pmkid.rs | 149 ++++++++++++- src/types.rs | 82 ++++++++ 11 files changed, 1164 insertions(+), 70 deletions(-) create mode
100644 src/mem_monitor.rs create mode 100644 src/store/disk_messages.rs diff --git a/src/lib.rs b/src/lib.rs index fed06d3..9e9194a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ pub mod ieee80211; pub mod input; pub mod link; pub mod log; +pub mod mem_monitor; pub mod mem_stats; pub mod output; pub mod pair; diff --git a/src/main.rs b/src/main.rs index e20a50a..9f6b808 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,6 +26,7 @@ use wpawolf::{ ieee80211::frame, input, link, log::Logger, + mem_monitor::MemMonitor, output::{EssidFilterConfig, OutputPaths, dedup::SinkId}, pair::combos::PairConfig, progress::ProgressReporter, @@ -446,8 +447,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { ))); } - // OOM guard: abort if RSS exceeds 80% of system RAM. - let oom_threshold_bytes = wpawolf::progress::total_ram_bytes() * 80 / 100; + let mut mem_monitor = MemMonitor::new(); // --- Phase 2 + 3 setup (moved up so per-file mode can emit inside the loop) --- let pair_config = PairConfig { @@ -548,10 +548,22 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { stats.total_packets += 1; frame_in_file += 1; logger.set_frame(frame_in_file); - // Periodic stderr progress line (no-op when --quiet). Cheap on the - // hot path: most calls return after a single u64 comparison. let eapol_total = stats.eapol_m1 + stats.eapol_m2 + stats.eapol_m3 + stats.eapol_m4; progress.tick(stats.total_packets, stats.input_file_count, eapol_total, stats.pmkids_found); + if mem_monitor.tick_packet() { + if !message_store.disk_mode() + && let Err(e) = message_store.flush_to_disk() + { + println!("error: failed to flush MessageStore to disk: {e}"); + std::process::exit(1); + } + if !pmkid_store.disk_mode() + && let Err(e) = pmkid_store.flush_to_disk() + { + println!("error: failed to flush PmkidStore to disk: {e}"); + std::process::exit(1); + } + } // Timestamp range (epoch microseconds). Initialise first_us on the very first packet. 
if stats.timestamp_first_us == 0 && packet.timestamp_us > 0 { stats.timestamp_first_us = packet.timestamp_us; @@ -710,16 +722,17 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { ); let _ = debug.memory_check(&format!("Phase 1 file {}/{total_inputs}", file_idx + 1)); - // OOM guard: every 1000 files, check RSS and abort if approaching OOM. - if (file_idx + 1) % 1000 == 0 { - let rss = wpawolf::progress::current_rss_bytes(); - if rss > oom_threshold_bytes { - let rss_mib = rss / (1024 * 1024); - let total_mib = wpawolf::progress::total_ram_bytes() / (1024 * 1024); - println!( - "error: approaching OOM -- RSS {rss_mib} MiB / {total_mib} MiB (>= 80%) during Phase 1 ingestion (file {}/{total_inputs}). Reduce input size, use --per-file, or increase available RAM.", - file_idx + 1 - ); + if mem_monitor.check() { + if !message_store.disk_mode() + && let Err(e) = message_store.flush_to_disk() + { + println!("error: failed to flush MessageStore to disk: {e}"); + std::process::exit(1); + } + if !pmkid_store.disk_mode() + && let Err(e) = pmkid_store.flush_to_disk() + { + println!("error: failed to flush PmkidStore to disk: {e}"); std::process::exit(1); } } @@ -864,9 +877,10 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { ); let _ = debug.memory_check("Phase 1 complete"); - if debug.enabled { + if debug.enabled && !message_store.disk_mode() { // Build group summaries for the top-25 survey and cost-tier breakdown. - // Both come from the same single pass over the store. + // Both come from the same single pass over the store. Skipped in disk + // mode to avoid loading all groups back into memory. 
let mut summaries: Vec = message_store .groups() .map(|(pair, msgs)| GroupSummary::from_messages(pair.ap, pair.sta, msgs)) @@ -986,6 +1000,8 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { } logger.flush()?; + message_store.cleanup_disk(); + pmkid_store.cleanup_disk(); stats.fragment_stats.fragments_incomplete = u64::try_from(fragment_store.len()).unwrap_or(u64::MAX); stats.print_summary(); diff --git a/src/mem_monitor.rs b/src/mem_monitor.rs new file mode 100644 index 0000000..57b7739 --- /dev/null +++ b/src/mem_monitor.rs @@ -0,0 +1,189 @@ +//! Memory pressure monitor for automatic disk-backed fallback. +//! +//! Tracks process RSS relative to total system RAM via `sysinfo`. When RSS +//! reaches 80% of total RAM, sets a sticky `disk_mode` flag that tells the +//! pipeline to spill heavy stores to disk instead of growing unboundedly in +//! memory. The flag is one-way: once set, it stays set for the remainder of +//! the run. +//! +//! Check points: +//! - Phase 1: every file transition + every `CHECK_INTERVAL` packets +//! - Phase 4: before `dedup.reserve()`, every `EMIT_CHECK_INTERVAL` hash lines +//! +//! Uses process RSS (not system `available_memory`) to avoid premature triggers +//! from the kernel page cache filling during sequential pcap reads. + +use crate::progress; + +/// Packets between memory checks during Phase 1. +const CHECK_INTERVAL: u64 = 50_000; + +/// Hash lines between memory checks during Phase 4 output. +pub const EMIT_CHECK_INTERVAL: u64 = 100_000; + +/// RAM usage threshold (tenths of a percent). 800 = 80.0%. +const THRESHOLD_TENTHS: u64 = 800; + +/// Memory pressure monitor with sticky disk-mode flag. +pub struct MemMonitor { + total_ram: u64, + threshold_bytes: u64, + last_rss: u64, + disk_mode: bool, + packets_since_check: u64, +} + +impl MemMonitor { + /// Creates a new monitor. Probes total system RAM once at init. 
+ #[must_use] + pub fn new() -> Self { + let total_ram = progress::total_ram_bytes(); + let threshold_bytes = total_ram / 1000 * THRESHOLD_TENTHS; + Self { total_ram, threshold_bytes, last_rss: 0, disk_mode: false, packets_since_check: 0 } + } + + /// Probes current RSS and activates disk mode if over threshold. + /// Returns `true` if disk mode just activated (first crossing). + pub fn check(&mut self) -> bool { + self.packets_since_check = 0; + let rss = progress::current_rss_bytes(); + self.last_rss = rss; + if !self.disk_mode && rss >= self.threshold_bytes { + self.disk_mode = true; + let rss_mib = rss / (1024 * 1024); + let total_mib = self.total_ram / (1024 * 1024); + eprintln!( + "wpawolf: memory pressure ({rss_mib} MiB / {total_mib} MiB, >= 80%) -- switching to disk-backed mode" + ); + return true; + } + false + } + + /// Increments the packet counter and checks memory if the interval has elapsed. + /// Returns `true` if disk mode just activated. + pub fn tick_packet(&mut self) -> bool { + self.packets_since_check += 1; + if self.packets_since_check >= CHECK_INTERVAL { + return self.check(); + } + false + } + + /// Predicts whether allocating `additional_bytes` would exceed the threshold. + /// Does NOT activate disk mode — the caller decides what to do. + #[must_use] + pub fn would_exceed(&mut self, additional_bytes: u64) -> bool { + let rss = progress::current_rss_bytes(); + self.last_rss = rss; + rss.saturating_add(additional_bytes) >= self.threshold_bytes + } + + /// Forces disk mode on. Used when `would_exceed` returns true and the caller + /// decides to skip a large allocation. + pub fn force_disk_mode(&mut self) { + if !self.disk_mode { + self.disk_mode = true; + let rss_mib = self.last_rss / (1024 * 1024); + let total_mib = self.total_ram / (1024 * 1024); + eprintln!( + "wpawolf: preemptive disk mode ({rss_mib} MiB / {total_mib} MiB) -- large allocation would exceed 80%" + ); + } + } + + /// Returns `true` if disk mode is active (sticky). 
+ #[must_use] + pub const fn disk_mode(&self) -> bool { + self.disk_mode + } + + /// Total system RAM in bytes. + #[must_use] + pub const fn total_ram(&self) -> u64 { + self.total_ram + } + + /// Last observed RSS in bytes. + #[must_use] + pub const fn last_rss(&self) -> u64 { + self.last_rss + } +} + +impl Default for MemMonitor { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for MemMonitor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemMonitor") + .field("total_ram", &self.total_ram) + .field("disk_mode", &self.disk_mode) + .field("last_rss_mib", &(self.last_rss / (1024 * 1024))) + .field("threshold_mib", &(self.threshold_bytes / (1024 * 1024))) + .field("packets_since_check", &self.packets_since_check) + .finish() + } +} + +// --- Unit tests --- + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used, missing_docs, reason = "test module")] + + use super::*; + + #[test] + fn new_does_not_panic() { + let m = MemMonitor::new(); + assert!(!m.disk_mode()); + assert!(m.total_ram() > 0); + } + + #[test] + fn check_returns_false_when_under_threshold() { + let mut m = MemMonitor::new(); + let activated = m.check(); + // On a machine with reasonable RAM, the test process itself is well under 80%. + assert!(!activated); + assert!(!m.disk_mode()); + } + + #[test] + fn tick_packet_checks_at_interval() { + let mut m = MemMonitor::new(); + for _ in 0..CHECK_INTERVAL - 1 { + assert!(!m.tick_packet()); + } + // The CHECK_INTERVAL-th tick triggers a check. 
+ let _ = m.tick_packet(); + assert_eq!(m.packets_since_check, 0, "counter should reset after check"); + } + + #[test] + fn force_disk_mode_sets_flag() { + let mut m = MemMonitor::new(); + assert!(!m.disk_mode()); + m.force_disk_mode(); + assert!(m.disk_mode()); + } + + #[test] + fn would_exceed_with_absurd_value() { + let mut m = MemMonitor::new(); + assert!(m.would_exceed(u64::MAX / 2)); + } + + #[test] + fn disk_mode_is_sticky() { + let mut m = MemMonitor::new(); + m.force_disk_mode(); + assert!(m.disk_mode()); + m.check(); // check again — should stay true + assert!(m.disk_mode()); + } +} diff --git a/src/output/dedup.rs b/src/output/dedup.rs index 15e05d5..7e2d32e 100644 --- a/src/output/dedup.rs +++ b/src/output/dedup.rs @@ -153,13 +153,16 @@ impl PerSinkDedup { Self::default() } - /// Pre-sizes every per-sink `HashSet` to hold at least `capacity` entries - /// without reallocating. Eliminates the transient memory spike from - /// hashbrown's power-of-2 resize doubling, where both old and new tables - /// are alive simultaneously during the copy. - pub fn reserve(&mut self, capacity: usize) { - for set in &mut self.sets { - set.reserve(capacity); + /// Pre-sizes per-sink `HashSet`s to hold at least `capacity` entries + /// without reallocating. Only reserves for sinks whose index appears in + /// `active_sinks`. Eliminates the transient memory spike from hashbrown's + /// power-of-2 resize doubling, where both old and new tables are alive + /// simultaneously during the copy. 
+ pub fn reserve(&mut self, capacity: usize, active_sinks: &[bool; SinkId::COUNT]) { + for (idx, set) in self.sets.iter_mut().enumerate() { + if active_sinks.get(idx).copied().unwrap_or(false) { + set.reserve(capacity); + } } } diff --git a/src/output/mod.rs b/src/output/mod.rs index 46c982a..3f703b1 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -266,6 +266,17 @@ impl HashSinks { self.sinks.iter().any(Option::is_some) } + /// Returns a boolean mask of which sinks have a configured path. + fn active_mask(&self) -> [bool; SinkId::COUNT] { + let mut mask = [false; SinkId::COUNT]; + for (i, s) in self.sinks.iter().enumerate() { + if let Some(m) = mask.get_mut(i) { + *m = s.is_some(); + } + } + mask + } + /// Flushes every sink whose file has actually been created. fn flush_all(&mut self) -> Result<()> { for s in self.sinks.iter_mut().flatten() { @@ -541,12 +552,11 @@ impl OutputContext { let mut batch: HashMap = HashMap::new(); message_store.fold_timestamp_range_into(&wanted, &mut batch); for entry in pmkid_store.iter() { - if !wanted.contains(&entry.ap) { - continue; + if wanted.contains(&entry.ap) { + let r = batch.entry(entry.ap).or_insert((u64::MAX, 0)); + r.0 = r.0.min(entry.timestamp); + r.1 = r.1.max(entry.timestamp); } - let r = batch.entry(entry.ap).or_insert((u64::MAX, 0)); - r.0 = r.0.min(entry.timestamp); - r.1 = r.1.max(entry.timestamp); } // Merge this batch's ranges into the accumulator with min/max. for (ap, (first, last)) in batch { @@ -578,6 +588,8 @@ impl OutputContext { // Pre-size the dedup sets so hashbrown never resizes mid-run. Without // this, a resize from 2^31 to 2^32 slots requires both tables to be // alive simultaneously (54 GiB transient spike at ~2B entries). + // Only reserve for sinks that have a configured output path (fixes the + // 9x over-allocation when only 1-2 sinks are active). 
let estimated_pairs = crate::pair::estimate_total_cost(message_store); let estimated_hashes = estimated_pairs.saturating_add(pmkid_store.total_count() as u64); if estimated_hashes > 0 { @@ -586,7 +598,8 @@ impl OutputContext { reason = "saturating to usize::MAX is safe -- HashSet clamps internally" )] let cap = usize::try_from(estimated_hashes).unwrap_or(usize::MAX); - self.dedup.reserve(cap); + let active = self.sinks.active_mask(); + self.dedup.reserve(cap, &active); } self.emit_inner( @@ -628,13 +641,6 @@ impl OutputContext { // because the PMK is derived from PSK+SSID -- a different SSID is a different PMK. if any_sink { for entry in pmkid_store.iter() { - // Per-source extractors may store entry.akm = Unknown when the - // extraction site has no AKM context (e.g. AMPE element in a Mesh - // Peering action frame, OSEN IE in an Association Request). When - // the BSS still advertises a PSK AKM in its Beacon, fall back on - // akm_map.get_best so the PMKID is still crackable. Without this - // fallback the PMKID parses successfully and counts in stats but - // never emits a hashcat line -- silent loss for the operator. let resolved_akm = if matches!(entry.akm, AkmType::Unknown) || HashType::from_akm_and_attack(entry.akm, true).is_none() { @@ -647,23 +653,15 @@ impl OutputContext { let ssids = essid_map.ssids_for_emit(&entry.ap, essid_filter.collapse_min, essid_filter.collapse_ratio); let is_ft = ht.is_ft(); - // For FT-PSK PMKIDs, only write when we have complete FT context (R0KH-ID required). - // hashcat mode 37100 requires MDID + R0KH-ID + R1KH-ID to crack the PMK chain. 
- // [hcxpcapngtool:2541] condition: mdidlen!=0 && r0khidlen!=0 && r1khidlen!=0 let ft_ctx: Option<&FtFields> = if is_ft { match entry.ft.as_ref().filter(|ft| ft.r0khid_len > 0) { Some(ft) => Some(ft), - None => continue, // FT-PSK PMKID without FT context -- not crackable + None => continue, } } else { None }; - // An empty `ssids` slice means we never observed a beacon / probe-resp / - // assoc for this AP. A hash line with a NULL ESSID is not crackable - // (hashcat needs the ESSID to derive the PMK), so we drop the would-be - // emission, track the AP for the per-AP `[essid_not_found_summary]` - // log line at the end of the run, and continue with the next entry. if ssids.is_empty() { *unresolved_drops.entry(entry.ap).or_insert(0) += 1; stats.essid_unresolved_emissions += 1; @@ -671,7 +669,7 @@ impl OutputContext { } for essid in ssids { - let item = FanItem::Pmkid { entry, ft: ft_ctx, essid }; + let item = FanItem::Pmkid { entry: &entry, ft: ft_ctx, essid }; let written = fan_out(sinks, dedup, stats, ht, item)?; if written { stats.pmkids_written += 1; diff --git a/src/pair/mod.rs b/src/pair/mod.rs index 5d00199..c3d817b 100644 --- a/src/pair/mod.rs +++ b/src/pair/mod.rs @@ -130,8 +130,13 @@ pub fn estimate_group_cost(messages: &[EapolMessage]) -> u64 { /// Estimates the total pairing cost across all groups in a `MessageStore`. /// /// Used to pre-size the dedup `HashSet` so hashbrown never resizes mid-run. +/// In disk mode, returns 0 -- dedup pre-sizing is skipped (disk-backed dedup +/// handles it without pre-allocation). #[must_use] pub fn estimate_total_cost(store: &MessageStore) -> u64 { + if store.disk_mode() { + return 0; + } store.groups().map(|(_, msgs)| estimate_group_cost(msgs)).sum() } @@ -155,10 +160,13 @@ fn pair_one_group( /// can process and drop pairs immediately, bounding peak memory to one group's /// output at a time. /// -/// When `thread_count > 1`, uses rayon's work-stealing `par_iter` for parallel -/// pairing. 
The `on_group` callback is serialized via a `Mutex` so I/O-bound -/// fan-out (writing to `BufWriter`s) does not need to be thread-safe. Pairing -/// itself runs fully parallel across cores. +/// When `thread_count > 1` and memory mode is active, uses rayon's work-stealing +/// `par_iter` for parallel pairing. The `on_group` callback is serialized via a +/// `Mutex` so I/O-bound fan-out (writing to `BufWriter`s) does not need to be +/// thread-safe. Pairing itself runs fully parallel across cores. +/// +/// In disk mode, iterates group keys single-threaded and loads each group lazily +/// from the temp file. Only one group's messages are in memory at a time. /// /// Returns the aggregate `NcDedupStats` across all groups. pub fn pair_all_groups_streaming( @@ -171,6 +179,10 @@ pub fn pair_all_groups_streaming( where F: Fn(Vec) + Send + Sync, { + if store.disk_mode() { + return pair_all_groups_disk(store, config, debug, on_group); + } + let groups: Vec<(&MacPair, &Vec)> = store.groups().collect(); if groups.is_empty() { @@ -234,6 +246,45 @@ where all_nc.into_inner().unwrap_or_default() } +/// Disk-mode pairing: iterates group keys single-threaded, loading one group at a time. 
+fn pair_all_groups_disk(store: &MessageStore, config: &PairConfig, debug: &DebugPrinter, on_group: F) -> NcDedupStats +where + F: Fn(Vec), +{ + let keys: Vec = store.group_keys().collect(); + if keys.is_empty() { + return NcDedupStats::default(); + } + + let total_groups = keys.len(); + let mut groups_done: usize = 0; + let mut pairs_done: usize = 0; + let mut all_nc = NcDedupStats::default(); + + for mac_pair in &keys { + let messages = store.load_group(mac_pair); + if messages.is_empty() { + continue; + } + let (m1, m2, m3, m4, cost) = group_counts_and_cost(&messages); + debug.group_start(mac_pair.ap, mac_pair.sta, m1, m2, m3, m4, cost); + let t0 = Instant::now(); + let (pairs, nc) = pair_one_group(mac_pair, &messages, config); + let elapsed_us = t0.elapsed().as_micros(); + debug.group_done(mac_pair.ap, mac_pair.sta, pairs.len(), elapsed_us, cost); + groups_done += 1; + pairs_done += pairs.len(); + if groups_done.is_multiple_of(group_progress_interval()) || groups_done == total_groups { + debug.group_progress(groups_done, total_groups, pairs_done); + } + merge_nc_stats(&mut all_nc, nc); + on_group(pairs); + // `messages` dropped here -- memory freed before next group loads + } + + all_nc +} + /// Collects all pairs into a `Vec`. Prefer `pair_all_groups_streaming` when /// peak memory matters -- this wrapper materializes the full pair set. #[must_use] diff --git a/src/store/disk_messages.rs b/src/store/disk_messages.rs new file mode 100644 index 0000000..d242083 --- /dev/null +++ b/src/store/disk_messages.rs @@ -0,0 +1,420 @@ +//! Disk-backed binary serialization for [`EapolMessage`] and [`PmkidEntry`]. +//! +//! Used by the disk fallback to spill messages to a temp file during Phase 1 +//! and read them back one group at a time during Phase 4. No serde — the +//! format is a fixed header followed by variable-length frame bytes. +//! +//! # Wire format (`EapolMessage`) +//! +//! ```text +//! offset len field +//! 0 8 timestamp (u64 LE) +//! 
8 1 msg_type (u8: 1=M1, 2=M2, 3=M3, 4=M4) +//! 9 1 key_version (u8) +//! 10 8 replay_counter (u64 LE) +//! 18 32 nonce ([u8; 32]) +//! 50 1 mic_len (u8: 16 or 24) +//! 51 24 mic_data ([u8; 24], zero-padded if mic_len < 24) +//! 75 1 has_pmkid (0 or 1) +//! 76 16 pmkid ([u8; 16], zeroed if has_pmkid == 0) +//! 92 1 akm (u8) +//! 93 1 is_rsn (0 or 1) +//! 94 1 has_ft (0 or 1) +//! 95 4 frame_len (u32 LE) +//! --- 99 bytes fixed header --- +//! 99 57 ft_fields (only when has_ft == 1) +//! mdid(2 LE) + r0khid_len(1) + r0khid(48) + r1khid(6) +//! 99|156 N eapol_frame bytes (frame_len bytes) +//! ``` + +use std::io::{Read, Write}; +use std::sync::Arc; + +use crate::store::messages::EapolMessage; +use crate::store::pmkid::PmkidEntry; +use crate::types::{AkmType, FtFields, MacAddr, MicBytes, MsgType, PmkidSource}; + +/// Fixed header size for `EapolMessage` serialization. +const EAPOL_HEADER_LEN: usize = 99; + +/// `FtFields` serialized size. +const FT_FIELDS_LEN: usize = 57; + +/// Fixed header size for `PmkidEntry` serialization. +const PMKID_HEADER_LEN: usize = 46; + +/// Serializes an `EapolMessage` to `writer`. Returns the number of bytes written. +/// +/// # Errors +/// +/// Returns `Err` on I/O failure. 
+#[allow( + clippy::indexing_slicing, + clippy::cast_possible_truncation, + reason = "buffer sizes are compile-time constants matching the wire format" +)] +pub fn write_eapol_message(w: &mut impl Write, msg: &EapolMessage) -> std::io::Result { + let mut buf = [0u8; EAPOL_HEADER_LEN]; + buf[0..8].copy_from_slice(&msg.timestamp.to_le_bytes()); + buf[8] = msg.msg_type as u8; + buf[9] = msg.key_version; + buf[10..18].copy_from_slice(&msg.replay_counter.to_le_bytes()); + buf[18..50].copy_from_slice(&msg.nonce); + #[allow(clippy::cast_possible_truncation, reason = "MIC is max 24 bytes, always fits in u8")] + let mic_len = msg.mic.len() as u8; + buf[50] = mic_len; + buf[51..51 + msg.mic.len()].copy_from_slice(msg.mic.as_slice()); + if let Some(pmkid) = &msg.pmkid { + buf[75] = 1; + buf[76..92].copy_from_slice(pmkid); + } + buf[92] = msg.akm.to_byte(); + buf[93] = u8::from(msg.is_rsn); + buf[94] = u8::from(msg.ft.is_some()); + let frame = msg.eapol_frame.as_ref(); + let frame_len = u32::try_from(frame.len()).unwrap_or(u32::MAX); + buf[95..99].copy_from_slice(&frame_len.to_le_bytes()); + w.write_all(&buf)?; + + let mut total = EAPOL_HEADER_LEN as u32; + if let Some(ft) = &msg.ft { + let ft_buf = serialize_ft_fields(ft); + w.write_all(&ft_buf)?; + total += FT_FIELDS_LEN as u32; + } + w.write_all(frame)?; + total += frame_len; + Ok(total) +} + +/// Deserializes an `EapolMessage` from `reader`. +/// +/// # Errors +/// +/// Returns `Err` on I/O failure or if the data is malformed. 
+#[allow( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::cast_possible_truncation, + reason = "buffer sizes are compile-time constants matching the wire format" +)] +pub fn read_eapol_message(r: &mut impl Read) -> std::io::Result { + let mut buf = [0u8; EAPOL_HEADER_LEN]; + r.read_exact(&mut buf)?; + + let timestamp = u64::from_le_bytes(buf[0..8].try_into().unwrap_or([0; 8])); + let msg_type = match buf[8] { + 1 => MsgType::M1, + 2 => MsgType::M2, + 3 => MsgType::M3, + 4 => MsgType::M4, + _ => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "bad msg_type")), + }; + let key_version = buf[9]; + let replay_counter = u64::from_le_bytes(buf[10..18].try_into().unwrap_or([0; 8])); + let mut nonce = [0u8; 32]; + nonce.copy_from_slice(&buf[18..50]); + let mic_len = buf[50]; + let mic = if mic_len == 24 { + MicBytes::from_24(buf[51..75].try_into().unwrap_or([0; 24])) + } else { + MicBytes::from_16(buf[51..67].try_into().unwrap_or([0; 16])) + }; + let pmkid = if buf[75] != 0 { + let mut p = [0u8; 16]; + p.copy_from_slice(&buf[76..92]); + Some(p) + } else { + None + }; + let akm = AkmType::from_byte(buf[92]); + let is_rsn = buf[93] != 0; + let has_ft = buf[94] != 0; + let frame_len = u32::from_le_bytes(buf[95..99].try_into().unwrap_or([0; 4])) as usize; + + let ft = if has_ft { + let mut ft_buf = [0u8; FT_FIELDS_LEN]; + r.read_exact(&mut ft_buf)?; + Some(Box::new(deserialize_ft_fields(&ft_buf))) + } else { + None + }; + + let mut frame_bytes = vec![0u8; frame_len]; + r.read_exact(&mut frame_bytes)?; + let eapol_frame: Arc<[u8]> = frame_bytes.into(); + + Ok(EapolMessage { + timestamp, + msg_type, + key_version, + replay_counter, + nonce, + mic, + pmkid, + eapol_frame, + ft, + akm, + is_rsn, + }) +} + +/// Serializes an `FtFields` to a fixed 57-byte buffer. 
+#[allow(clippy::indexing_slicing, reason = "buffer size is a compile-time constant matching the wire format")] +fn serialize_ft_fields(ft: &FtFields) -> [u8; FT_FIELDS_LEN] { + let mut buf = [0u8; FT_FIELDS_LEN]; + buf[0..2].copy_from_slice(&ft.mdid); + buf[2] = ft.r0khid_len; + buf[3..51].copy_from_slice(&ft.r0khid); + buf[51..57].copy_from_slice(&ft.r1khid); + buf +} + +/// Deserializes an `FtFields` from a 57-byte buffer. +#[allow(clippy::indexing_slicing, reason = "buffer size is a compile-time constant matching the wire format")] +fn deserialize_ft_fields(buf: &[u8; FT_FIELDS_LEN]) -> FtFields { + let mut mdid = [0u8; 2]; + mdid.copy_from_slice(&buf[0..2]); + let r0khid_len = buf[2]; + let mut r0khid = [0u8; 48]; + r0khid.copy_from_slice(&buf[3..51]); + let mut r1khid = [0u8; 6]; + r1khid.copy_from_slice(&buf[51..57]); + FtFields { mdid, r0khid_len, r0khid, r1khid } +} + +// --- PmkidEntry serialization --- + +/// Serializes a `PmkidEntry` to `writer`. Returns the number of bytes written. +/// +/// # Errors +/// +/// Returns `Err` on I/O failure. +#[allow( + clippy::indexing_slicing, + clippy::cast_possible_truncation, + reason = "buffer sizes are compile-time constants matching the wire format" +)] +pub fn write_pmkid_entry(w: &mut impl Write, entry: &PmkidEntry) -> std::io::Result { + let mut buf = [0u8; PMKID_HEADER_LEN]; + buf[0..8].copy_from_slice(&entry.timestamp.to_le_bytes()); + buf[8..14].copy_from_slice(&entry.ap.0); + buf[14..20].copy_from_slice(&entry.sta.0); + buf[20..36].copy_from_slice(&entry.pmkid); + buf[36] = entry.source.to_byte(); + buf[37] = entry.akm.to_byte(); + buf[38] = u8::from(entry.ft.is_some()); + // 7 bytes reserved (39..46) for future fields + w.write_all(&buf)?; + + let mut total = PMKID_HEADER_LEN as u32; + if let Some(ft) = &entry.ft { + let ft_buf = serialize_ft_fields(ft); + w.write_all(&ft_buf)?; + total += FT_FIELDS_LEN as u32; + } + Ok(total) +} + +/// Deserializes a `PmkidEntry` from `reader`. 
+/// +/// # Errors +/// +/// Returns `Err` on I/O failure. +#[allow( + clippy::indexing_slicing, + clippy::unwrap_used, + reason = "buffer sizes are compile-time constants matching the wire format" +)] +pub fn read_pmkid_entry(r: &mut impl Read) -> std::io::Result { + let mut buf = [0u8; PMKID_HEADER_LEN]; + r.read_exact(&mut buf)?; + + let timestamp = u64::from_le_bytes(buf[0..8].try_into().unwrap_or([0; 8])); + let ap = MacAddr(buf[8..14].try_into().unwrap_or([0; 6])); + let sta = MacAddr(buf[14..20].try_into().unwrap_or([0; 6])); + let mut pmkid = [0u8; 16]; + pmkid.copy_from_slice(&buf[20..36]); + let source = PmkidSource::from_byte(buf[36]); + let akm = AkmType::from_byte(buf[37]); + let has_ft = buf[38] != 0; + + let ft = if has_ft { + let mut ft_buf = [0u8; FT_FIELDS_LEN]; + r.read_exact(&mut ft_buf)?; + Some(Box::new(deserialize_ft_fields(&ft_buf))) + } else { + None + }; + + Ok(PmkidEntry { timestamp, ap, sta, pmkid, source, akm, ft }) +} + +// --- Unit tests --- + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used, clippy::indexing_slicing, missing_docs, reason = "test module")] + + use super::*; + + fn make_test_message(msg_type: MsgType, with_ft: bool, with_pmkid: bool) -> EapolMessage { + let ft = if with_ft { + Some(Box::new(FtFields { + mdid: [0x12, 0x34], + r0khid_len: 6, + r0khid: { + let mut r = [0u8; 48]; + r[..6].copy_from_slice(&[0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF]); + r + }, + r1khid: [0x11, 0x22, 0x33, 0x44, 0x55, 0x66], + })) + } else { + None + }; + let pmkid = if with_pmkid { Some([0x42u8; 16]) } else { None }; + + EapolMessage { + timestamp: 1_700_000_000_000_000, + msg_type, + key_version: 2, + replay_counter: 42, + nonce: { + let mut n = [0u8; 32]; + n[0] = 0xA5; + n[31] = 0x5A; + n + }, + mic: MicBytes::from_16([0x11; 16]), + pmkid, + eapol_frame: Arc::from(vec![0xAA, 0xBB, 0xCC, 0xDD, 0xEE].as_slice()), + ft, + akm: AkmType::Wpa2Psk, + is_rsn: true, + } + } + + #[test] + fn eapol_round_trip_no_ft_no_pmkid() { + let msg = 
make_test_message(MsgType::M2, false, false); + let mut buf = Vec::new(); + let written = write_eapol_message(&mut buf, &msg).unwrap(); + assert_eq!(written as usize, buf.len()); + + let mut cursor = std::io::Cursor::new(&buf); + let restored = read_eapol_message(&mut cursor).unwrap(); + + assert_eq!(restored.timestamp, msg.timestamp); + assert_eq!(restored.msg_type, msg.msg_type); + assert_eq!(restored.key_version, msg.key_version); + assert_eq!(restored.replay_counter, msg.replay_counter); + assert_eq!(restored.nonce, msg.nonce); + assert_eq!(restored.mic.as_slice(), msg.mic.as_slice()); + assert_eq!(restored.pmkid, msg.pmkid); + assert_eq!(restored.eapol_frame.as_ref(), msg.eapol_frame.as_ref()); + assert!(restored.ft.is_none()); + assert_eq!(restored.akm, msg.akm); + assert_eq!(restored.is_rsn, msg.is_rsn); + } + + #[test] + fn eapol_round_trip_with_ft_and_pmkid() { + let msg = make_test_message(MsgType::M1, true, true); + let mut buf = Vec::new(); + write_eapol_message(&mut buf, &msg).unwrap(); + + let mut cursor = std::io::Cursor::new(&buf); + let restored = read_eapol_message(&mut cursor).unwrap(); + + assert_eq!(restored.pmkid, Some([0x42; 16])); + let ft = restored.ft.as_ref().unwrap(); + assert_eq!(ft.mdid, [0x12, 0x34]); + assert_eq!(ft.r0khid_len, 6); + assert_eq!(ft.r1khid, [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]); + } + + #[test] + fn eapol_round_trip_24_byte_mic() { + let mut msg = make_test_message(MsgType::M3, false, false); + msg.mic = MicBytes::from_24([0xCC; 24]); + let mut buf = Vec::new(); + write_eapol_message(&mut buf, &msg).unwrap(); + + let mut cursor = std::io::Cursor::new(&buf); + let restored = read_eapol_message(&mut cursor).unwrap(); + assert_eq!(restored.mic.len(), 24); + assert_eq!(restored.mic.as_slice(), &[0xCC; 24]); + } + + #[test] + fn multiple_messages_sequential() { + let msgs = vec![ + make_test_message(MsgType::M1, false, true), + make_test_message(MsgType::M2, false, false), + make_test_message(MsgType::M3, true, false), + 
make_test_message(MsgType::M4, false, false), + ]; + + let mut buf = Vec::new(); + for msg in &msgs { + write_eapol_message(&mut buf, msg).unwrap(); + } + + let mut cursor = std::io::Cursor::new(&buf); + for original in &msgs { + let restored = read_eapol_message(&mut cursor).unwrap(); + assert_eq!(restored.msg_type, original.msg_type); + assert_eq!(restored.timestamp, original.timestamp); + assert_eq!(restored.eapol_frame.as_ref(), original.eapol_frame.as_ref()); + } + } + + #[test] + fn pmkid_round_trip_no_ft() { + let entry = PmkidEntry { + timestamp: 999_000, + ap: MacAddr([0x02, 0x00, 0x00, 0x00, 0x00, 0xAA]), + sta: MacAddr([0x02, 0x00, 0x00, 0x00, 0x00, 0xBB]), + pmkid: [0x55; 16], + source: PmkidSource::M1KeyData, + akm: AkmType::Wpa2Psk, + ft: None, + }; + + let mut buf = Vec::new(); + write_pmkid_entry(&mut buf, &entry).unwrap(); + + let mut cursor = std::io::Cursor::new(&buf); + let restored = read_pmkid_entry(&mut cursor).unwrap(); + assert_eq!(restored.timestamp, entry.timestamp); + assert_eq!(restored.ap, entry.ap); + assert_eq!(restored.sta, entry.sta); + assert_eq!(restored.pmkid, entry.pmkid); + assert_eq!(restored.akm, entry.akm); + assert!(restored.ft.is_none()); + } + + #[test] + fn pmkid_round_trip_with_ft() { + let entry = PmkidEntry { + timestamp: 42, + ap: MacAddr([0xFF; 6]), + sta: MacAddr([0x11; 6]), + pmkid: [0xAB; 16], + source: PmkidSource::FtAuthStaToAp, + akm: AkmType::FtPsk, + ft: Some(Box::new(FtFields { mdid: [0x56, 0x78], r0khid_len: 3, r0khid: [0u8; 48], r1khid: [0xDE; 6] })), + }; + + let mut buf = Vec::new(); + write_pmkid_entry(&mut buf, &entry).unwrap(); + + let mut cursor = std::io::Cursor::new(&buf); + let restored = read_pmkid_entry(&mut cursor).unwrap(); + assert_eq!(restored.akm, AkmType::FtPsk); + let ft = restored.ft.unwrap(); + assert_eq!(ft.mdid, [0x56, 0x78]); + assert_eq!(ft.r1khid, [0xDE; 6]); + } +} diff --git a/src/store/messages.rs b/src/store/messages.rs index 55b5a0b..6c434f6 100644 --- 
a/src/store/messages.rs +++ b/src/store/messages.rs @@ -8,9 +8,11 @@ //! buffer shared across all pairs. See `ARCHITECTURE.md §3.3` and `§4` (invariant 2). use std::collections::{HashMap, HashSet}; +use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write as _}; use std::sync::Arc; use crate::ieee80211::eapol::EapolKey; +use crate::store::disk_messages::{read_eapol_message, write_eapol_message}; use crate::types::{AkmType, FtFields, MacAddr, MacPair, MicBytes, MsgType}; // --- EapolMessage --- @@ -94,16 +96,39 @@ pub enum Admission { // --- MessageStore --- +/// Lightweight reference to a serialized message on disk. +#[derive(Debug, Clone, Copy)] +struct MessageRef { + offset: u64, +} + /// Primary storage for EAPOL messages grouped by (AP, STA) pair. /// -/// Uses `HashMap>` -- each (AP, STA) pair gets its own -/// append-only vector. Messages are never evicted by timestamp or replay counter. -/// The pairing engine (Phase 4) reads from this store after all packets are collected. -/// See `ARCHITECTURE.md §3.3` and `§5.1`. -#[derive(Debug, Default)] +/// Uses `HashMap>` in memory mode. When disk mode +/// activates (memory pressure), messages are flushed to a temp file and replaced +/// with lightweight `MessageRef` offsets. New messages go directly to disk. +/// The pairing engine reads groups lazily from disk during Phase 4. +#[derive(Default)] pub struct MessageStore { groups: HashMap>, total_count: usize, + disk_index: HashMap>, + disk_writer: Option>, + disk_path: Option, + disk_offset: u64, + disk_mode: bool, +} + +// Default is derived via #[derive(Default)] on the struct. 
+ +impl std::fmt::Debug for MessageStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MessageStore") + .field("total_count", &self.total_count) + .field("group_count", &self.group_count()) + .field("disk_mode", &self.disk_mode) + .finish_non_exhaustive() + } } impl MessageStore { @@ -139,6 +164,9 @@ impl MessageStore { /// principle be tagged with a different `AkmType` if the surrounding /// RSN-IE context advanced between two observations. pub fn add(&mut self, ap: MacAddr, sta: MacAddr, msg: EapolMessage) -> Admission { + if self.disk_mode { + return self.add_to_disk(ap, sta, &msg); + } let pair = MacPair::new(ap, sta); let entries = self.groups.entry(pair).or_default(); @@ -150,7 +178,24 @@ impl MessageStore { Admission::Stored } + /// Appends a message directly to the disk file (disk mode only). + fn add_to_disk(&mut self, ap: MacAddr, sta: MacAddr, msg: &EapolMessage) -> Admission { + let Some(writer) = &mut self.disk_writer else { + return Admission::Duplicate; + }; + let Ok(written) = write_eapol_message(writer, msg) else { + return Admission::Duplicate; + }; + let pair = MacPair::new(ap, sta); + let refs = self.disk_index.entry(pair).or_default(); + refs.push(MessageRef { offset: self.disk_offset }); + self.disk_offset += u64::from(written); + self.total_count += 1; + Admission::Stored + } + /// Iterates over all (AP, STA) groups and their message vectors. + /// Only valid in memory mode. In disk mode, use `group_keys()` + `load_group()`. pub fn groups(&self) -> impl Iterator)> { self.groups.iter() } @@ -164,7 +209,7 @@ impl MessageStore { /// Returns the number of distinct (AP, STA) groups. #[must_use] pub fn group_count(&self) -> usize { - self.groups.len() + if self.disk_mode { self.disk_index.len() } else { self.groups.len() } } /// Drops every group and resets the total-message counter. @@ -175,9 +220,99 @@ impl MessageStore { /// per-file pair count is similar across files). 
pub fn clear(&mut self) { self.groups.clear(); + self.disk_index.clear(); self.total_count = 0; } + /// Returns `true` if the store is operating in disk-backed mode. + #[must_use] + pub const fn disk_mode(&self) -> bool { + self.disk_mode + } + + /// Flushes all in-memory messages to a temp file and switches to disk mode. + /// + /// After this call, `groups` is empty and `disk_index` holds lightweight + /// references into the temp file. New messages arriving via `add()` are + /// serialized directly to disk. + /// + /// # Errors + /// + /// Returns `Err` if the temp file cannot be created or written. + pub fn flush_to_disk(&mut self) -> crate::types::Result<()> { + if self.disk_mode { + return Ok(()); + } + let dir = std::env::temp_dir(); + let path = dir.join(format!("wpawolf_messages_{}.bin", std::process::id())); + let file = std::fs::File::create(&path)?; + let mut writer = BufWriter::new(file); + let mut offset: u64 = 0; + + let old_groups = std::mem::take(&mut self.groups); + for (pair, msgs) in old_groups { + let mut refs = Vec::with_capacity(msgs.len()); + for msg in &msgs { + let written = write_eapol_message(&mut writer, msg).map_err(crate::types::Error::Io)?; + refs.push(MessageRef { offset }); + offset += u64::from(written); + } + self.disk_index.insert(pair, refs); + } + writer.flush().map_err(crate::types::Error::Io)?; + self.disk_writer = Some(writer); + self.disk_path = Some(path); + self.disk_offset = offset; + self.disk_mode = true; + Ok(()) + } + + /// Returns an iterator over group keys. In disk mode, iterates the disk + /// index keys (cheap -- no messages loaded). In memory mode, iterates the + /// in-memory `HashMap` keys. 
+ #[must_use] + #[allow(clippy::type_complexity, reason = "single return site; a type alias adds indirection without clarity")] + pub fn group_keys(&self) -> Box + '_> { + if self.disk_mode { Box::new(self.disk_index.keys().copied()) } else { Box::new(self.groups.keys().copied()) } + } + + /// Loads all messages for a single group from disk. Only valid in disk mode. + /// + /// # Panics + /// + /// Panics if called when not in disk mode. + #[must_use] + pub fn load_group(&self, key: &MacPair) -> Vec { + assert!(self.disk_mode, "load_group called in memory mode"); + let Some(refs) = self.disk_index.get(key) else { + return Vec::new(); + }; + let Some(path) = &self.disk_path else { + return Vec::new(); + }; + let Ok(file) = std::fs::File::open(path) else { + return Vec::new(); + }; + let mut reader = BufReader::new(file); + let mut messages = Vec::with_capacity(refs.len()); + for mref in refs { + if reader.seek(SeekFrom::Start(mref.offset)).is_ok() + && let Ok(msg) = read_eapol_message(&mut reader) + { + messages.push(msg); + } + } + messages + } + + /// Cleans up the temp file. Called on shutdown. + pub fn cleanup_disk(&mut self) { + if let Some(path) = self.disk_path.take() { + let _ = std::fs::remove_file(path); + } + self.disk_writer = None; + } + /// Coarse heap + struct-bytes estimate for `--mem-stats` reporting. /// /// Sums the `HashMap` bucket overhead, every `Vec` allocation, @@ -188,11 +323,15 @@ impl MessageStore { /// upper-bound on the EAPOL-store footprint. 
#[must_use] pub fn approx_bytes(&self) -> usize { + if self.disk_mode { + let index_bytes = self.disk_index.capacity() * (size_of::() + size_of::>() + 8); + let refs_bytes: usize = self.disk_index.values().map(|v| v.capacity() * size_of::()).sum(); + return size_of::() + index_bytes + refs_bytes; + } let groups_cap_bytes = self.groups.capacity() * (size_of::() + size_of::>() + 8); let mut msgs_bytes = 0usize; for v in self.groups.values() { msgs_bytes += v.capacity() * size_of::(); - // Arc<[u8]> heap payload per message: 16-byte ArcInner header + bytes. for m in v { msgs_bytes = msgs_bytes.saturating_add(m.eapol_frame.len() + 16); } @@ -213,6 +352,9 @@ impl MessageStore { where F: FnMut(MacAddr) -> MacAddr, { + if self.disk_mode { + return self.canonicalize_pairs_disk(&mut canonicalize); + } let old = std::mem::take(&mut self.groups); let old_group_count = old.len(); let old_total = self.total_count; @@ -220,19 +362,30 @@ impl MessageStore { for (pair, mut msgs) in old { let canon_ap = canonicalize(pair.ap); let canon_sta = canonicalize(pair.sta); - // Nothing changed for this pair if both addresses already equal the canonical form. let canon_pair = MacPair::new(canon_ap, canon_sta); self.total_count += msgs.len(); - // Messages may carry any addresses in their frame-level fields; the EapolMessage - // struct does not store ap/sta per-message (they live in the store key), so we - // only need to rewrite the key. self.groups.entry(canon_pair).or_default().append(&mut msgs); } debug_assert_eq!(self.total_count, old_total, "canonicalization must not drop messages"); - // Merged groups = (old distinct keys) - (new distinct keys). (old_group_count as u64).saturating_sub(self.groups.len() as u64) } + /// Disk-mode canonicalization: rewrite index keys without loading message data. 
+ fn canonicalize_pairs_disk(&mut self, canonicalize: &mut F) -> u64 + where + F: FnMut(MacAddr) -> MacAddr, + { + let old_index = std::mem::take(&mut self.disk_index); + let old_group_count = old_index.len(); + for (pair, refs) in old_index { + let canon_ap = canonicalize(pair.ap); + let canon_sta = canonicalize(pair.sta); + let canon_pair = MacPair::new(canon_ap, canon_sta); + self.disk_index.entry(canon_pair).or_default().extend(refs); + } + (old_group_count as u64).saturating_sub(self.disk_index.len() as u64) + } + /// Folds the earliest and latest message timestamps for every AP MAC in /// `wanted` into the `out` map. /// @@ -245,6 +398,20 @@ impl MessageStore { /// "`essid_not_found`" APs so the operator can locate the source frames in /// the original capture without having to grep the whole `MessageStore`. pub fn fold_timestamp_range_into(&self, wanted: &HashSet, out: &mut HashMap) { + if self.disk_mode { + for key in self.disk_index.keys() { + if !wanted.contains(&key.ap) { + continue; + } + let msgs = self.load_group(key); + for msg in &msgs { + let entry = out.entry(key.ap).or_insert((u64::MAX, 0)); + entry.0 = entry.0.min(msg.timestamp); + entry.1 = entry.1.max(msg.timestamp); + } + } + return; + } for (mac_pair, msgs) in &self.groups { if !wanted.contains(&mac_pair.ap) { continue; @@ -273,6 +440,9 @@ impl MessageStore { /// `--eapoltimeout` filter would be worthwhile. #[must_use] pub fn count_anonce_m1_m3_mismatches(&self) -> u64 { + if self.disk_mode { + return self.count_anonce_m1_m3_mismatches_disk(); + } let mut mismatches: u64 = 0; for msgs in self.groups.values() { // Collect all distinct M1 and M3 ANonces in this group. 
@@ -309,6 +479,36 @@ impl MessageStore { } mismatches } + + fn count_anonce_m1_m3_mismatches_disk(&self) -> u64 { + let mut mismatches: u64 = 0; + for key in self.disk_index.keys() { + let msgs = self.load_group(key); + let mut m1_nonces: Vec<[u8; 32]> = Vec::new(); + let mut m3_nonces: Vec<[u8; 32]> = Vec::new(); + for msg in &msgs { + let bucket = match msg.msg_type { + MsgType::M1 => &mut m1_nonces, + MsgType::M3 => &mut m3_nonces, + MsgType::M2 | MsgType::M4 => continue, + }; + if !bucket.contains(&msg.nonce) { + bucket.push(msg.nonce); + } + } + let multi_m1 = m1_nonces.len() > 1; + let multi_m3 = m3_nonces.len() > 1; + let cross_mismatch = if !m1_nonces.is_empty() && !m3_nonces.is_empty() { + m1_nonces.iter().any(|n1| m3_nonces.iter().any(|n3| n1 != n3)) + } else { + false + }; + if multi_m1 || multi_m3 || cross_mismatch { + mismatches += 1; + } + } + mismatches + } } // --- PendingEapol (deferred WDS EAPOL) --- diff --git a/src/store/mod.rs b/src/store/mod.rs index 5f3d6d6..3e4f9f3 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -10,6 +10,7 @@ //! memory budget estimates. pub mod auxiliary; +pub mod disk_messages; pub mod essid; pub mod fragments; pub mod messages; diff --git a/src/store/pmkid.rs b/src/store/pmkid.rs index ff7fe25..22d1bb8 100644 --- a/src/store/pmkid.rs +++ b/src/store/pmkid.rs @@ -15,7 +15,9 @@ //! catalogue. use std::collections::HashMap; +use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write as _}; +use crate::store::disk_messages::{read_pmkid_entry, write_pmkid_entry}; use crate::types::{AkmType, FtFields, MacAddr, MacPair, PmkidSource}; // --- PmkidEntry --- @@ -46,14 +48,36 @@ pub struct PmkidEntry { // --- PmkidStore --- +/// Lightweight reference to a serialized PMKID entry on disk. +#[derive(Debug, Clone, Copy)] +struct PmkidRef { + offset: u64, +} + /// Storage for PMKIDs, grouped by (AP, STA) pair with per-pair deduplication. 
/// /// When the same 16-byte PMKID value is seen multiple times for the same pair /// (e.g., in both M1 Key Data and M2 RSN IE), only the first occurrence is kept. /// Different PMKID values for the same pair are all stored. See `ARCHITECTURE.md §6`. -#[derive(Debug, Default)] +#[derive(Default)] pub struct PmkidStore { groups: HashMap>, + disk_index: HashMap>, + disk_writer: Option>, + disk_path: Option, + disk_offset: u64, + disk_mode: bool, +} + +// Default is derived via #[derive(Default)] on the struct. + +impl std::fmt::Debug for PmkidStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PmkidStore") + .field("total_count", &self.total_count()) + .field("disk_mode", &self.disk_mode) + .finish_non_exhaustive() + } } impl PmkidStore { @@ -78,6 +102,9 @@ impl PmkidStore { if crate::types::garbage_pattern_kind(&entry.pmkid).is_some() { return false; } + if self.disk_mode { + return self.add_to_disk(&entry); + } let pair = MacPair::new(entry.ap, entry.sta); let entries = self.groups.entry(pair).or_default(); if entries.iter().any(|e| e.pmkid == entry.pmkid) { @@ -87,9 +114,101 @@ impl PmkidStore { true } + fn add_to_disk(&mut self, entry: &PmkidEntry) -> bool { + let Some(writer) = &mut self.disk_writer else { + return false; + }; + let Ok(written) = write_pmkid_entry(writer, entry) else { + return false; + }; + let pair = MacPair::new(entry.ap, entry.sta); + let refs = self.disk_index.entry(pair).or_default(); + refs.push(PmkidRef { offset: self.disk_offset }); + self.disk_offset += u64::from(written); + true + } + /// Iterates over all stored PMKID entries across all (AP, STA) pairs. - pub fn iter(&self) -> impl Iterator { - self.groups.values().flatten() + /// In disk mode, loads all entries from disk into a temporary Vec. 
+ #[must_use] + #[allow( + clippy::iter_without_into_iter, + reason = "return type is Box; IntoIterator for &PmkidStore would add indirection without benefit" + )] + pub fn iter(&self) -> Box + '_> { + if self.disk_mode { + let all: Vec = self.load_all_entries(); + return Box::new(all.into_iter()); + } + Box::new(self.groups.values().flatten().cloned()) + } + + fn load_all_entries(&self) -> Vec { + let Some(path) = &self.disk_path else { + return Vec::new(); + }; + let Ok(file) = std::fs::File::open(path) else { + return Vec::new(); + }; + let mut reader = BufReader::new(file); + let mut entries = Vec::new(); + for refs in self.disk_index.values() { + for pref in refs { + if reader.seek(SeekFrom::Start(pref.offset)).is_ok() + && let Ok(entry) = read_pmkid_entry(&mut reader) + { + entries.push(entry); + } + } + } + entries + } + + /// Returns `true` if disk mode is active. + #[must_use] + pub const fn disk_mode(&self) -> bool { + self.disk_mode + } + + /// Flushes all in-memory PMKIDs to a temp file and switches to disk mode. + /// + /// # Errors + /// + /// Returns `Err` if the temp file cannot be created or written. 
+ pub fn flush_to_disk(&mut self) -> crate::types::Result<()> { + if self.disk_mode { + return Ok(()); + } + let dir = std::env::temp_dir(); + let path = dir.join(format!("wpawolf_pmkids_{}.bin", std::process::id())); + let file = std::fs::File::create(&path)?; + let mut writer = BufWriter::new(file); + let mut offset: u64 = 0; + + let old_groups = std::mem::take(&mut self.groups); + for (pair, entries) in old_groups { + let mut refs = Vec::with_capacity(entries.len()); + for entry in &entries { + let written = write_pmkid_entry(&mut writer, entry).map_err(crate::types::Error::Io)?; + refs.push(PmkidRef { offset }); + offset += u64::from(written); + } + self.disk_index.insert(pair, refs); + } + writer.flush().map_err(crate::types::Error::Io)?; + self.disk_writer = Some(writer); + self.disk_path = Some(path); + self.disk_offset = offset; + self.disk_mode = true; + Ok(()) + } + + /// Cleans up the temp file. Called on shutdown. + pub fn cleanup_disk(&mut self) { + if let Some(path) = self.disk_path.take() { + let _ = std::fs::remove_file(path); + } + self.disk_writer = None; } /// Rewrites every group key and embedded AP/STA addresses using `canonicalize`. @@ -102,12 +221,21 @@ impl PmkidStore { where F: FnMut(MacAddr) -> MacAddr, { + if self.disk_mode { + let old_index = std::mem::take(&mut self.disk_index); + for (pair, refs) in old_index { + let canon_ap = canonicalize(pair.ap); + let canon_sta = canonicalize(pair.sta); + let canon_pair = MacPair::new(canon_ap, canon_sta); + self.disk_index.entry(canon_pair).or_default().extend(refs); + } + return; + } let old = std::mem::take(&mut self.groups); for (_pair, entries) in old { for mut entry in entries { entry.ap = canonicalize(entry.ap); entry.sta = canonicalize(entry.sta); - // Re-use add() so dedup-by-PMKID runs on the merged group. self.add(entry); } } @@ -116,6 +244,9 @@ impl PmkidStore { /// Returns the total number of unique PMKIDs stored. 
#[must_use] pub fn total_count(&self) -> usize { + if self.disk_mode { + return self.disk_index.values().map(Vec::len).sum(); + } self.groups.values().map(Vec::len).sum() } @@ -124,15 +255,17 @@ impl PmkidStore { /// reuses the existing buckets. pub fn clear(&mut self) { self.groups.clear(); + self.disk_index.clear(); } /// Coarse heap + struct-bytes estimate for `--mem-stats` reporting. - /// - /// Counts the `HashMap` bucket overhead, every `Vec` allocation, - /// and every `PmkidEntry` struct. Does not count `FtFields` heap, which is - /// rare (only FT-PSK PMKIDs). #[must_use] pub fn approx_bytes(&self) -> usize { + if self.disk_mode { + let index_bytes = self.disk_index.capacity() * (size_of::() + size_of::>() + 8); + let refs_bytes: usize = self.disk_index.values().map(|v| v.capacity() * size_of::()).sum(); + return size_of::() + index_bytes + refs_bytes; + } let groups_cap_bytes = self.groups.capacity() * (size_of::() + size_of::>() + 8); let mut entries_bytes = 0usize; for v in self.groups.values() { diff --git a/src/types.rs b/src/types.rs index 8e8bd8d..27438d3 100644 --- a/src/types.rs +++ b/src/types.rs @@ -182,6 +182,34 @@ impl AkmType { pub const fn is_psk_sha256(self) -> bool { matches!(self, Self::PskSha256) } + + /// Encodes as a `u8` for binary serialization. + #[must_use] + pub const fn to_byte(self) -> u8 { + match self { + Self::Wpa1 => 0, + Self::Wpa2Psk => 1, + Self::FtPsk => 2, + Self::FtPskSha384 => 3, + Self::PskSha256 => 4, + Self::PskSha384 => 5, + Self::Unknown => 255, + } + } + + /// Decodes from a `u8` produced by [`Self::to_byte`]. + #[must_use] + pub const fn from_byte(b: u8) -> Self { + match b { + 0 => Self::Wpa1, + 1 => Self::Wpa2Psk, + 2 => Self::FtPsk, + 3 => Self::FtPskSha384, + 4 => Self::PskSha256, + 5 => Self::PskSha384, + _ => Self::Unknown, + } + } } // --- Hash type (11-type classification) --- @@ -434,6 +462,60 @@ pub enum PmkidSource { OsenIe, } +impl PmkidSource { + /// Encodes as a `u8` for binary serialization. 
+ #[must_use] + pub const fn to_byte(self) -> u8 { + match self { + Self::M1KeyData => 0, + Self::M2RsnIe => 1, + Self::AssocRequest => 2, + Self::ReassocRequest => 3, + Self::FtAuthStaToAp => 4, + Self::FtAuthApToSta => 5, + Self::FilsAuthStaToAp => 6, + Self::FilsAuthApToSta => 7, + Self::PasnAuthStaToAp => 8, + Self::PasnAuthApToSta => 9, + Self::FtActionRequest => 10, + Self::FtActionResponse => 11, + Self::FtActionConfirm => 12, + Self::ProbeRequest => 13, + Self::BeaconRsnIe => 14, + Self::ProbeRespRsnIe => 15, + Self::MeshPeeringOpen => 16, + Self::MeshPeeringConfirm => 17, + Self::OsenIe => 18, + } + } + + /// Decodes from a `u8` produced by [`Self::to_byte`]. + #[must_use] + pub const fn from_byte(b: u8) -> Self { + match b { + 1 => Self::M2RsnIe, + 2 => Self::AssocRequest, + 3 => Self::ReassocRequest, + 4 => Self::FtAuthStaToAp, + 5 => Self::FtAuthApToSta, + 6 => Self::FilsAuthStaToAp, + 7 => Self::FilsAuthApToSta, + 8 => Self::PasnAuthStaToAp, + 9 => Self::PasnAuthApToSta, + 10 => Self::FtActionRequest, + 11 => Self::FtActionResponse, + 12 => Self::FtActionConfirm, + 13 => Self::ProbeRequest, + 14 => Self::BeaconRsnIe, + 15 => Self::ProbeRespRsnIe, + 16 => Self::MeshPeeringOpen, + 17 => Self::MeshPeeringConfirm, + 18 => Self::OsenIe, + _ => Self::M1KeyData, // fallback for unknown bytes + } + } +} + // --- Error type --- /// All errors produced by wpawolf. From 7fa94acde61636a3efe930cc4a72f95454428889 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Thu, 28 May 2026 00:05:57 -0400 Subject: [PATCH 02/14] feat: would_exceed guard on dedup reserve + ASCII fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add memory pressure check before PerSinkDedup::reserve(). When the estimated allocation (12 bytes × estimated_hashes × active_sink_count) would push RSS past the 80% threshold, skip pre-sizing entirely and let sets grow incrementally. 
This prevents the single-shot allocation OOM that occurs when the estimated cost is in the billions. Pass MemMonitor through OutputContext::emit() and run_output() so the output pipeline can check memory pressure before pre-sizing. Fix non-ASCII em-dashes in mem_monitor.rs and disk_messages.rs. --- src/main.rs | 2 ++ src/mem_monitor.rs | 4 ++-- src/output/mod.rs | 36 +++++++++++++++++++++++++++++------- src/store/disk_messages.rs | 2 +- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9f6b808..295e267 100644 --- a/src/main.rs +++ b/src/main.rs @@ -777,6 +777,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { thread_count, essid_filter, &debug, + &mut mem_monitor, )?; message_store.clear(); pmkid_store.clear(); @@ -927,6 +928,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { thread_count, essid_filter, &debug, + &mut mem_monitor, )?; debug.phase_done(4, "Emit", ""); diff --git a/src/mem_monitor.rs b/src/mem_monitor.rs index 57b7739..5b62546 100644 --- a/src/mem_monitor.rs +++ b/src/mem_monitor.rs @@ -71,7 +71,7 @@ impl MemMonitor { } /// Predicts whether allocating `additional_bytes` would exceed the threshold. - /// Does NOT activate disk mode — the caller decides what to do. + /// Does NOT activate disk mode -- the caller decides what to do. 
#[must_use] pub fn would_exceed(&mut self, additional_bytes: u64) -> bool { let rss = progress::current_rss_bytes(); @@ -183,7 +183,7 @@ mod tests { let mut m = MemMonitor::new(); m.force_disk_mode(); assert!(m.disk_mode()); - m.check(); // check again — should stay true + m.check(); // check again -- should stay true assert!(m.disk_mode()); } } diff --git a/src/output/mod.rs b/src/output/mod.rs index 3f703b1..de552e3 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -467,9 +467,20 @@ pub fn run_output( essid_filter: EssidFilterConfig, logger: &mut Logger, debug: &DebugPrinter, + mem_monitor: &mut crate::mem_monitor::MemMonitor, ) -> Result { let mut ctx = OutputContext::new(paths); - ctx.emit(message_store, pmkid_store, essid_map, akm_map, pair_config, thread_count, essid_filter, debug)?; + ctx.emit( + message_store, + pmkid_store, + essid_map, + akm_map, + pair_config, + thread_count, + essid_filter, + debug, + mem_monitor, + )?; ctx.finalize( paths, essid_set, @@ -584,22 +595,31 @@ impl OutputContext { thread_count: usize, essid_filter: EssidFilterConfig, debug: &DebugPrinter, + mem_monitor: &mut crate::mem_monitor::MemMonitor, ) -> Result<()> { // Pre-size the dedup sets so hashbrown never resizes mid-run. Without // this, a resize from 2^31 to 2^32 slots requires both tables to be // alive simultaneously (54 GiB transient spike at ~2B entries). // Only reserve for sinks that have a configured output path (fixes the // 9x over-allocation when only 1-2 sinks are active). + // Skip pre-sizing entirely if the allocation would exceed the memory + // threshold -- the sets will grow incrementally instead. 
let estimated_pairs = crate::pair::estimate_total_cost(message_store); let estimated_hashes = estimated_pairs.saturating_add(pmkid_store.total_count() as u64); if estimated_hashes > 0 { - #[allow( - clippy::cast_possible_truncation, - reason = "saturating to usize::MAX is safe -- HashSet clamps internally" - )] - let cap = usize::try_from(estimated_hashes).unwrap_or(usize::MAX); let active = self.sinks.active_mask(); - self.dedup.reserve(cap, &active); + let active_count = active.iter().filter(|&&a| a).count() as u64; + let estimated_bytes = estimated_hashes.saturating_mul(12).saturating_mul(active_count); + if mem_monitor.would_exceed(estimated_bytes) { + mem_monitor.force_disk_mode(); + } else { + #[allow( + clippy::cast_possible_truncation, + reason = "saturating to usize::MAX is safe -- HashSet clamps internally" + )] + let cap = usize::try_from(estimated_hashes).unwrap_or(usize::MAX); + self.dedup.reserve(cap, &active); + } } self.emit_inner( @@ -936,6 +956,7 @@ mod tests { let paths = OutputPaths::default(); let mut logger = Logger::new(None).unwrap(); + let mut mem_monitor = crate::mem_monitor::MemMonitor::new(); let stats = run_output( &msg_store, &pmkid_store, @@ -954,6 +975,7 @@ mod tests { EssidFilterConfig::default(), &mut logger, &DebugPrinter::new(false), + &mut mem_monitor, ) .unwrap(); diff --git a/src/store/disk_messages.rs b/src/store/disk_messages.rs index d242083..8c1c8e8 100644 --- a/src/store/disk_messages.rs +++ b/src/store/disk_messages.rs @@ -1,7 +1,7 @@ //! Disk-backed binary serialization for [`EapolMessage`] and [`PmkidEntry`]. //! //! Used by the disk fallback to spill messages to a temp file during Phase 1 -//! and read them back one group at a time during Phase 4. No serde — the +//! and read them back one group at a time during Phase 4. No serde -- the //! format is a fixed header followed by variable-length frame bytes. //! //! 
# Wire format (`EapolMessage`) From 6f9f2785448024f96332bc0d382e1ef85c9fe0ff Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Thu, 28 May 2026 00:27:18 -0400 Subject: [PATCH 03/14] feat: disk-backed dedup with partitioned bucket files + cleaning pass When the dedup HashSet would exceed the 80% RSS threshold, switch to write-through mode: hash lines go directly to output files (accepting temporary duplicates) while fingerprints are recorded in 256 partitioned bucket files per sink (fingerprint % 256, 16 bytes per record). After emission completes, a cleaning pass processes buckets one at a time: sort by fingerprint, identify runs with count > 1, collect line numbers to remove (keep first occurrence), then rewrite each output file without the duplicate lines. Mid-emission switchover is supported: if memory pressure activates during Phase 4 output, the in-memory HashSet is flushed to bucket files with sentinel line numbers (u64::MAX), the HashSet is drained to free memory, and emission continues in write-through mode. The cleaning pass handles the mixed state correctly. 
New module: src/output/disk_dedup.rs - DiskDedup: coordinator with per-sink bucket state - DiskDedupSink: 256 bucket file writers per sink - build_removal_set(): sort-based duplicate detection - rewrite_without_lines(): line-number-based output filter - Drop impl ensures bucket files are cleaned up - 5 unit tests covering no-dups, dups, sentinels, cleanup Modified: src/output/dedup.rs - SinkId::from_index() for index-to-enum conversion - PerSinkDedup::flush_to_buckets() for mid-emission switchover - PerSinkDedup::drain() to free HashSet memory Modified: src/output/mod.rs - fan_out() accepts Option for write-through mode - OutputContext holds disk_dedup state - Cleaning pass runs in finalize() before auxiliary outputs - HashSinks::path() accessor for cleaning pass --- src/output/dedup.rs | 43 ++++ src/output/disk_dedup.rs | 465 +++++++++++++++++++++++++++++++++++++++ src/output/mod.rs | 54 ++++- 3 files changed, 554 insertions(+), 8 deletions(-) create mode 100644 src/output/disk_dedup.rs diff --git a/src/output/dedup.rs b/src/output/dedup.rs index 7e2d32e..5b78723 100644 --- a/src/output/dedup.rs +++ b/src/output/dedup.rs @@ -126,6 +126,23 @@ impl SinkId { pub const fn as_index(self) -> usize { self as usize } + + /// Converts a numeric index back to a `SinkId`, or `None` if out of range. + #[must_use] + pub const fn from_index(idx: usize) -> Option { + match idx { + 0 => Some(Self::Out22000), + 1 => Some(Self::Out37100), + 2 => Some(Self::OutCombined), + 3 => Some(Self::OutWpa1), + 4 => Some(Self::OutWpa2), + 5 => Some(Self::OutPskSha256), + 6 => Some(Self::OutFt), + 7 => Some(Self::OutPskSha384), + 8 => Some(Self::OutFtPskSha384), + _ => None, + } + } } /// Per-sink deduplication filter. @@ -166,6 +183,32 @@ impl PerSinkDedup { } } + /// Flushes all in-memory fingerprints to a `DiskDedup`'s bucket files. 
+ /// + /// Each fingerprint is recorded with `line_number = u64::MAX` (sentinel) + /// so the cleaning pass knows these were already deduped in memory and + /// don't correspond to output lines that need removal. + /// + /// # Errors + /// + /// Returns `Err` on I/O failure writing to bucket files. + pub fn flush_to_buckets(&self, disk_dedup: &mut super::disk_dedup::DiskDedup) -> crate::types::Result<()> { + for (idx, set) in self.sets.iter().enumerate() { + if set.is_empty() { + continue; + } + let Some(sink) = SinkId::from_index(idx) else { continue }; + disk_dedup.flush_hashset(sink, set)?; + } + Ok(()) + } + + /// Replaces all internal `HashSet`s with empty ones, freeing memory. + /// Called after `flush_to_buckets` during mid-emission switchover. + pub fn drain(&mut self) { + self.sets = Default::default(); + } + /// Returns `true` if this PMKID entry is new for `sink` and records the fingerprint. pub fn check_pmkid(&mut self, sink: SinkId, entry: &PmkidEntry, essid: &[u8]) -> bool { let fp = pmkid_fingerprint(entry, essid); diff --git a/src/output/disk_dedup.rs b/src/output/disk_dedup.rs new file mode 100644 index 0000000..9d651ce --- /dev/null +++ b/src/output/disk_dedup.rs @@ -0,0 +1,465 @@ +//! Disk-backed deduplication via partitioned fingerprint bucket files. +//! +//! When in-memory dedup (`PerSinkDedup`) would exceed the memory threshold, +//! this module takes over. Hash lines are written directly to output files +//! (write-through, accepting temporary duplicates). Each line's u64 `SipHash` +//! fingerprint plus its line number are appended to one of 256 bucket files +//! per sink (`fingerprint % 256`). After emission completes, a cleaning pass +//! loads buckets one at a time, identifies duplicate fingerprints, and rewrites +//! each output file without the duplicates. +//! +//! # Bucket file format +//! +//! Each bucket stores a sequence of 16-byte records (u64 LE pairs): +//! ```text +//! offset len field +//! 
0 8 line_number (u64 LE) -- 0-based index within the sink's output +//! 8 8 fingerprint (u64 LE) -- SipHash-1-3 of the hash line fields +//! ``` +//! +//! Records with `line_number == u64::MAX` are sentinels from mid-emission +//! switchover: they represent fingerprints that were already deduped in memory +//! and count as "first occurrence" during the cleaning pass. + +use std::collections::HashSet; +use std::io::{BufRead, BufReader, BufWriter, Read, Write}; +use std::path::{Path, PathBuf}; + +use crate::output::dedup::SinkId; +use crate::types::Result; + +const NUM_BUCKETS: usize = 256; +const RECORD_SIZE: usize = 16; +const SENTINEL_LINE: u64 = u64::MAX; +const BUCKET_BUF_CAPACITY: usize = 32 * 1024; +const CLEAN_BUF_CAPACITY: usize = 64 * 1024; + +// --- DiskDedupSink --- + +/// Per-sink bucket file state. +struct DiskDedupSink { + bucket_writers: Vec>>, + bucket_dir: PathBuf, + line_count: u64, +} + +impl DiskDedupSink { + fn new(bucket_dir: PathBuf) -> Result { + std::fs::create_dir_all(&bucket_dir)?; + Ok(Self { bucket_writers: (0..NUM_BUCKETS).map(|_| None).collect(), bucket_dir, line_count: 0 }) + } + + #[allow(clippy::indexing_slicing, reason = "bucket_idx always < NUM_BUCKETS from % operation")] + fn get_or_create_writer(&mut self, bucket_idx: usize) -> Result<&mut BufWriter> { + if self.bucket_writers[bucket_idx].is_none() { + let path = self.bucket_dir.join(format!("b{bucket_idx:03}.bin")); + let file = std::fs::File::create(path)?; + self.bucket_writers[bucket_idx] = Some(BufWriter::with_capacity(BUCKET_BUF_CAPACITY, file)); + } + Ok(self.bucket_writers[bucket_idx].as_mut().unwrap_or_else(|| unreachable!())) + } + + fn record(&mut self, fingerprint: u64) -> Result<()> { + #[allow(clippy::cast_possible_truncation, reason = "NUM_BUCKETS is 256, fits u64 modulo")] + let bucket_idx = (fingerprint % NUM_BUCKETS as u64) as usize; + let line_number = self.line_count; + self.line_count += 1; + let writer = self.get_or_create_writer(bucket_idx)?; + 
writer.write_all(&line_number.to_le_bytes())?; + writer.write_all(&fingerprint.to_le_bytes())?; + Ok(()) + } + + fn record_sentinel(&mut self, fingerprint: u64) -> Result<()> { + #[allow(clippy::cast_possible_truncation, reason = "NUM_BUCKETS is 256, fits u64 modulo")] + let bucket_idx = (fingerprint % NUM_BUCKETS as u64) as usize; + let writer = self.get_or_create_writer(bucket_idx)?; + writer.write_all(&SENTINEL_LINE.to_le_bytes())?; + writer.write_all(&fingerprint.to_le_bytes())?; + Ok(()) + } + + fn flush_all(&mut self) -> Result<()> { + for w in self.bucket_writers.iter_mut().flatten() { + w.flush()?; + } + Ok(()) + } +} + +// --- DiskDedup --- + +/// Monotonic counter for unique temp directory names when multiple `DiskDedup` +/// instances are created in the same process (e.g. `--per-file` mode, tests). +static INSTANCE_COUNTER: std::sync::atomic::AtomicU32 = std::sync::atomic::AtomicU32::new(0); + +/// Disk-backed deduplication coordinator for all output sinks. +pub struct DiskDedup { + sinks: [Option; SinkId::COUNT], + base_dir: PathBuf, +} + +impl DiskDedup { + /// Creates a new `DiskDedup` with bucket directories for each active sink. + /// + /// # Errors + /// + /// Returns `Err` if the temp directory cannot be created. + pub fn new(active_mask: &[bool; SinkId::COUNT]) -> Result { + let seq = INSTANCE_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let base_dir = std::env::temp_dir().join(format!("wpawolf_dedup_{}_{seq}", std::process::id())); + std::fs::create_dir_all(&base_dir)?; + + let mut sinks: [Option; SinkId::COUNT] = Default::default(); + for (idx, active) in active_mask.iter().enumerate() { + if *active { + let sink_dir = base_dir.join(format!("sink_{idx}")); + if let Some(slot) = sinks.get_mut(idx) { + *slot = Some(DiskDedupSink::new(sink_dir)?); + } + } + } + + Ok(Self { sinks, base_dir }) + } + + /// Records a fingerprint for a line just written to `sink`. + /// + /// # Errors + /// + /// Returns `Err` on I/O failure. 
+ pub fn record(&mut self, sink: SinkId, fingerprint: u64) -> Result<()> { + if let Some(Some(ds)) = self.sinks.get_mut(sink.as_index()) { + ds.record(fingerprint)?; + } + Ok(()) + } + + /// Flushes an in-memory `HashSet` to bucket files with sentinel line numbers. + /// + /// # Errors + /// + /// Returns `Err` on I/O failure. + pub fn flush_hashset(&mut self, sink: SinkId, set: &HashSet) -> Result<()> { + if let Some(Some(ds)) = self.sinks.get_mut(sink.as_index()) { + for &fp in set { + ds.record_sentinel(fp)?; + } + } + Ok(()) + } + + /// Runs the post-emission cleaning pass for all active sinks. + /// + /// For each sink that has bucket data: + /// 1. Flush bucket writers. + /// 2. Load buckets one at a time, sort by fingerprint, identify duplicates. + /// 3. Collect line numbers to remove (all but first occurrence of each + /// duplicate fingerprint, excluding sentinels). + /// 4. Rewrite the output file without the removal lines. + /// + /// # Errors + /// + /// Returns `Err` on I/O failure. + pub fn clean_all(&mut self, sink_path: F) -> Result<()> + where + F: Fn(SinkId) -> Option, + { + for (idx, slot) in self.sinks.iter_mut().enumerate() { + let Some(ds) = slot.as_mut() else { continue }; + ds.flush_all()?; + + let Some(sink_id) = SinkId::from_index(idx) else { continue }; + let Some(output_path) = sink_path(sink_id) else { continue }; + + let removal = build_removal_set(&ds.bucket_dir)?; + if !removal.is_empty() { + rewrite_without_lines(&output_path, &removal)?; + } + } + Ok(()) + } + + /// Deletes all bucket files and directories. + pub fn cleanup(&mut self) { + for slot in &mut self.sinks { + if let Some(mut ds) = slot.take() { + // Drop all writers before removing files. 
+ for w in &mut ds.bucket_writers { + *w = None; + } + let _ = std::fs::remove_dir_all(&ds.bucket_dir); + } + } + let _ = std::fs::remove_dir_all(&self.base_dir); + } +} + +impl Drop for DiskDedup { + fn drop(&mut self) { + self.cleanup(); + } +} + +impl std::fmt::Debug for DiskDedup { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let active: usize = self.sinks.iter().filter(|s| s.is_some()).count(); + f.debug_struct("DiskDedup") + .field("active_sinks", &active) + .field("base_dir", &self.base_dir) + .finish_non_exhaustive() + } +} + +// --- Cleaning pass helpers --- + +/// Loads all bucket files from `bucket_dir`, identifies duplicate fingerprints, +/// and returns the set of line numbers to remove from the output file. +fn build_removal_set(bucket_dir: &Path) -> Result> { + let mut removal = HashSet::new(); + + for bucket_idx in 0..NUM_BUCKETS { + let path = bucket_dir.join(format!("b{bucket_idx:03}.bin")); + let Ok(meta) = std::fs::metadata(&path) else { continue }; + if meta.len() == 0 { + continue; + } + + let mut records = load_bucket(&path)?; + if records.is_empty() { + continue; + } + + // Sort by fingerprint, then by line number (sentinels sort last within + // a fingerprint group, but we handle them explicitly). + records.sort_unstable_by_key(|&(line, fp)| (fp, line)); + + // Walk runs of identical fingerprints. + let mut i = 0; + while i < records.len() { + let fp = records.get(i).map_or(0, |r| r.1); + let run_start = i; + while i < records.len() && records.get(i).map_or(0, |r| r.1) == fp { + i += 1; + } + let run_end = i; + + if run_end - run_start <= 1 { + continue; + } + + // Find the first non-sentinel in the run. If a sentinel exists, + // it counts as "first occurrence" (already deduped in memory). + let has_sentinel = (run_start..run_end).any(|j| records.get(j).map_or(0, |r| r.0) == SENTINEL_LINE); + + if has_sentinel { + // All non-sentinel entries are duplicates of the in-memory original. 
+ for j in run_start..run_end { + let line = records.get(j).map_or(SENTINEL_LINE, |r| r.0); + if line != SENTINEL_LINE { + removal.insert(line); + } + } + } else { + // No sentinel -- keep the first (lowest line number), remove the rest. + for j in (run_start + 1)..run_end { + let line = records.get(j).map_or(SENTINEL_LINE, |r| r.0); + if line != SENTINEL_LINE { + removal.insert(line); + } + } + } + } + } + + Ok(removal) +} + +/// Reads all (`line_number`, fingerprint) records from a bucket file. +fn load_bucket(path: &Path) -> Result> { + let file = std::fs::File::open(path)?; + let file_len = file.metadata()?.len(); + #[allow( + clippy::cast_possible_truncation, + reason = "bucket files are small; record count fits usize on all targets" + )] + let record_count = (file_len / RECORD_SIZE as u64) as usize; + let mut reader = BufReader::with_capacity(CLEAN_BUF_CAPACITY, file); + let mut records = Vec::with_capacity(record_count); + let mut buf = [0u8; RECORD_SIZE]; + + loop { + match reader.read_exact(&mut buf) { + Ok(()) => { + let line = u64::from_le_bytes([buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]]); + let fp = u64::from_le_bytes([buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14], buf[15]]); + records.push((line, fp)); + }, + Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + } + + Ok(records) +} + +/// Rewrites an output file, skipping lines whose 0-based index is in `removal`. 
+fn rewrite_without_lines(output_path: &Path, removal: &HashSet) -> Result<()> { + let clean_path = output_path.with_extension("wpawolf_clean"); + { + let input = std::fs::File::open(output_path)?; + let reader = BufReader::with_capacity(CLEAN_BUF_CAPACITY, input); + let output = std::fs::File::create(&clean_path)?; + let mut writer = BufWriter::with_capacity(CLEAN_BUF_CAPACITY, output); + + for (line_num, line_result) in reader.lines().enumerate() { + let line = line_result?; + #[allow(clippy::cast_possible_truncation, reason = "line count fits u64 for any real file")] + if !removal.contains(&(line_num as u64)) { + writer.write_all(line.as_bytes())?; + writer.write_all(b"\n")?; + } + } + writer.flush()?; + } + std::fs::rename(&clean_path, output_path)?; + Ok(()) +} + +// --- Unit tests --- + +#[cfg(test)] +mod tests { + #![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::indexing_slicing, + missing_docs, + clippy::wildcard_imports, + reason = "test module" + )] + + use super::*; + use crate::output::dedup::SinkId; + + fn active_mask_single() -> [bool; SinkId::COUNT] { + let mut m = [false; SinkId::COUNT]; + m[SinkId::Out22000.as_index()] = true; + m + } + + #[test] + fn disk_dedup_creates_and_cleans_up() { + let mut dd = DiskDedup::new(&active_mask_single()).unwrap(); + assert!(dd.base_dir.exists()); + dd.cleanup(); + assert!(!dd.base_dir.exists()); + } + + #[test] + fn record_and_clean_no_duplicates() { + let dir = std::env::temp_dir().join(format!("wpawolf_test_dedup_{}_a", std::process::id())); + let _ = std::fs::remove_dir_all(&dir); + + let out_path = dir.join("test.22000"); + std::fs::create_dir_all(&dir).unwrap(); + + // Write 3 unique lines to the output file. + { + let mut f = std::fs::File::create(&out_path).unwrap(); + writeln!(f, "line_a").unwrap(); + writeln!(f, "line_b").unwrap(); + writeln!(f, "line_c").unwrap(); + } + + let mut dd = DiskDedup::new(&active_mask_single()).unwrap(); + // Record 3 unique fingerprints. 
+ dd.record(SinkId::Out22000, 100).unwrap(); + dd.record(SinkId::Out22000, 200).unwrap(); + dd.record(SinkId::Out22000, 300).unwrap(); + + dd.clean_all(|sink| if sink == SinkId::Out22000 { Some(out_path.clone()) } else { None }).unwrap(); + + let content = std::fs::read_to_string(&out_path).unwrap(); + assert_eq!(content, "line_a\nline_b\nline_c\n"); + + dd.cleanup(); + let _ = std::fs::remove_dir_all(&dir); + } + + #[test] + fn record_and_clean_with_duplicates() { + let dir = std::env::temp_dir().join(format!("wpawolf_test_dedup_{}_b", std::process::id())); + let _ = std::fs::remove_dir_all(&dir); + + let out_path = dir.join("test.22000"); + std::fs::create_dir_all(&dir).unwrap(); + + // Write 5 lines: line 0, 1, 2 are unique; line 3 duplicates line 0; line 4 duplicates line 1. + { + let mut f = std::fs::File::create(&out_path).unwrap(); + writeln!(f, "unique_a").unwrap(); + writeln!(f, "unique_b").unwrap(); + writeln!(f, "unique_c").unwrap(); + writeln!(f, "dup_of_a").unwrap(); + writeln!(f, "dup_of_b").unwrap(); + } + + let mut dd = DiskDedup::new(&active_mask_single()).unwrap(); + dd.record(SinkId::Out22000, 100).unwrap(); // line 0 + dd.record(SinkId::Out22000, 200).unwrap(); // line 1 + dd.record(SinkId::Out22000, 300).unwrap(); // line 2 + dd.record(SinkId::Out22000, 100).unwrap(); // line 3 -- dup of 0 + dd.record(SinkId::Out22000, 200).unwrap(); // line 4 -- dup of 1 + + dd.clean_all(|sink| if sink == SinkId::Out22000 { Some(out_path.clone()) } else { None }).unwrap(); + + let content = std::fs::read_to_string(&out_path).unwrap(); + assert_eq!(content, "unique_a\nunique_b\nunique_c\n"); + + dd.cleanup(); + let _ = std::fs::remove_dir_all(&dir); + } + + #[test] + fn sentinel_fingerprints_count_as_first_occurrence() { + let dir = std::env::temp_dir().join(format!("wpawolf_test_dedup_{}_c", std::process::id())); + let _ = std::fs::remove_dir_all(&dir); + + let out_path = dir.join("test.22000"); + std::fs::create_dir_all(&dir).unwrap(); + + // 2 lines in 
output, but fingerprint 100 was already deduped in memory (sentinel). + { + let mut f = std::fs::File::create(&out_path).unwrap(); + writeln!(f, "dup_of_sentinel").unwrap(); + writeln!(f, "unique_line").unwrap(); + } + + let mut dd = DiskDedup::new(&active_mask_single()).unwrap(); + // Flush a sentinel for fingerprint 100 (was in memory). + let mut sentinel_set = HashSet::new(); + sentinel_set.insert(100u64); + dd.flush_hashset(SinkId::Out22000, &sentinel_set).unwrap(); + // Record output lines. + dd.record(SinkId::Out22000, 100).unwrap(); // line 0 -- dup of sentinel + dd.record(SinkId::Out22000, 200).unwrap(); // line 1 -- unique + + dd.clean_all(|sink| if sink == SinkId::Out22000 { Some(out_path.clone()) } else { None }).unwrap(); + + let content = std::fs::read_to_string(&out_path).unwrap(); + assert_eq!(content, "unique_line\n", "sentinel should cause line 0 to be removed"); + + dd.cleanup(); + let _ = std::fs::remove_dir_all(&dir); + } + + #[test] + fn drop_cleans_up() { + let dd = DiskDedup::new(&active_mask_single()).unwrap(); + let base = dd.base_dir.clone(); + assert!(base.exists()); + drop(dd); + assert!(!base.exists()); + } +} diff --git a/src/output/mod.rs b/src/output/mod.rs index de552e3..39dfa15 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -10,6 +10,7 @@ pub mod dedup; pub mod device_info; +pub mod disk_dedup; pub mod hashcat; pub mod wordlists; @@ -284,6 +285,11 @@ impl HashSinks { } Ok(()) } + + /// Returns the output path for a given sink, if configured. + fn path(&self, sink: SinkId) -> Option { + self.sinks.get(sink.as_index()).and_then(Option::as_ref).map(|s| s.path.clone()) + } } /// Returns the per-AKM extended sink that accepts a given `HashType`, if any. 
@@ -373,11 +379,13 @@ fn build_pmkid_line(entry: &PmkidEntry, ft: Option<&FtFields>, essid: &[u8], sin fn fan_out( sinks: &mut HashSinks, dedup: &mut PerSinkDedup, + dd: &mut Option, stats: &mut OutputStats, ht: HashType, item: FanItem<'_>, ) -> Result { let candidates: [Option; 3] = [legacy_sink_for(ht), Some(extended_sink_for(ht)), Some(SinkId::OutCombined)]; + let disk_mode = dd.is_some(); // Pre-build the EAPOL body once (prefix-independent) if any sink will accept it. let eapol_body: Option> = match item { @@ -400,9 +408,16 @@ fn fan_out( let idx = sink.as_index(); let Some(slot) = sinks.sinks.get_mut(idx) else { continue }; let Some(lazy) = slot.as_mut() else { continue }; - let accepted = match item { - FanItem::Pmkid { entry, essid, .. } => dedup.check_pmkid(sink, entry, essid), - FanItem::Eapol { pair, essid, .. } => dedup.check_eapol(sink, pair, essid), + + // In disk mode: always accept (write-through), record fingerprint to buckets. + // In memory mode: use in-memory HashSet dedup gate. + let accepted = if disk_mode { + true + } else { + match item { + FanItem::Pmkid { entry, essid, .. } => dedup.check_pmkid(sink, entry, essid), + FanItem::Eapol { pair, essid, .. } => dedup.check_eapol(sink, pair, essid), + } }; if accepted { let writer = lazy.writer()?; @@ -423,6 +438,13 @@ fn fan_out( if let Some(c) = stats.lines_per_sink.get_mut(idx) { *c += 1; } + if let Some(disk) = dd.as_mut() { + let fp = match item { + FanItem::Pmkid { entry, essid, .. } => dedup::pmkid_fingerprint(entry, essid), + FanItem::Eapol { pair, essid, .. } => dedup::eapol_fingerprint(pair, essid), + }; + disk.record(sink, fp)?; + } any_written = true; } else if let Some(c) = stats.dropped_per_sink.get_mut(idx) { *c += 1; @@ -512,6 +534,7 @@ pub struct OutputContext { stats: OutputStats, dedup: PerSinkDedup, sinks: HashSinks, + disk_dedup: Option, /// APs whose hash lines we declined to emit because no ESSID was ever /// observed for them. 
Such lines are not crackable (hashcat needs the /// ESSID to derive the PMK), so they go to `--log` only -- nothing @@ -546,6 +569,7 @@ impl OutputContext { stats: OutputStats::default(), dedup: PerSinkDedup::new(), sinks: HashSinks::open(paths), + disk_dedup: None, unresolved_drops: HashMap::new(), timestamp_ranges: HashMap::new(), } @@ -612,6 +636,9 @@ impl OutputContext { let estimated_bytes = estimated_hashes.saturating_mul(12).saturating_mul(active_count); if mem_monitor.would_exceed(estimated_bytes) { mem_monitor.force_disk_mode(); + if self.disk_dedup.is_none() { + self.disk_dedup = Some(disk_dedup::DiskDedup::new(&active)?); + } } else { #[allow( clippy::cast_possible_truncation, @@ -651,6 +678,7 @@ impl OutputContext { let stats = &mut self.stats; let dedup = &mut self.dedup; let sinks = &mut self.sinks; + let disk_dedup = &mut self.disk_dedup; let unresolved_drops = &mut self.unresolved_drops; // --- Pipeline 1: PMKIDs (Invariant OUT-1 -- always before EAPOL pairs) --- @@ -690,7 +718,7 @@ impl OutputContext { for essid in ssids { let item = FanItem::Pmkid { entry: &entry, ft: ft_ctx, essid }; - let written = fan_out(sinks, dedup, stats, ht, item)?; + let written = fan_out(sinks, dedup, disk_dedup, stats, ht, item)?; if written { stats.pmkids_written += 1; *stats.hash_type_emitted.entry(ht).or_insert(0) += 1; @@ -713,12 +741,14 @@ impl OutputContext { struct EmitState<'a> { sinks: &'a mut HashSinks, dedup: &'a mut PerSinkDedup, + disk_dedup: &'a mut Option, stats: &'a mut OutputStats, unresolved_drops: &'a mut HashMap, first_error: Option, } - let emit_state = std::sync::Mutex::new(EmitState { sinks, dedup, stats, unresolved_drops, first_error: None }); + let emit_state = + std::sync::Mutex::new(EmitState { sinks, dedup, disk_dedup, stats, unresolved_drops, first_error: None }); let total_pairs_processed = std::sync::atomic::AtomicUsize::new(0); let nc_stats = @@ -734,7 +764,8 @@ impl OutputContext { } if any_sink { - let EmitState { sinks: s, dedup: d, 
stats: st, unresolved_drops: ud, first_error } = &mut *guard; + let EmitState { sinks: s, dedup: d, disk_dedup: dd, stats: st, unresolved_drops: ud, first_error } = + &mut *guard; for pair in &pairs { let Some(ht) = HashType::from_akm_and_attack(pair.akm, false) else { continue }; let ssids = @@ -758,7 +789,7 @@ impl OutputContext { for essid in ssids { let item = FanItem::Eapol { pair, ft: ft_ctx, essid }; - match fan_out(s, d, st, ht, item) { + match fan_out(s, d, dd, st, ht, item) { Ok(written) => { if written { st.pairs_written += 1; @@ -848,9 +879,16 @@ impl OutputContext { device_store: &DeviceInfoStore, logger: &mut Logger, ) -> Result { - // Flush hash writers before opening auxiliary outputs. + // Flush hash writers before the cleaning pass. self.sinks.flush_all()?; + // Run the disk dedup cleaning pass if active. This rewrites each output + // file to remove duplicate lines that were accepted in write-through mode. + if let Some(mut dd) = self.disk_dedup.take() { + let sinks_ref = &self.sinks; + dd.clean_all(|sink| sinks_ref.path(sink))?; + } + // --- Per-AP unresolved-SSID summary --- // // Timestamp ranges were captured during `emit` (so per-file mode From 598160ffffa8ee5ce5de51da7505bcebb385713e Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Thu, 28 May 2026 00:55:32 -0400 Subject: [PATCH 04/14] fix: flush disk writers before Phase 4 + PMKID disk dedup Two bugs found during forced disk-mode testing (WPAWOLF_MEM_THRESHOLD=1): 1. BufWriter flush: add_to_disk() writes through a BufWriter, but load_group()/iter() opens the file for reading independently. Records still in the BufWriter buffer were invisible to the reader, causing 124 PMKID lines to be silently lost. Fix: add flush_disk_writer() methods to both MessageStore and PmkidStore, called before Phase 4. 2. PMKID dedup in disk mode: add_to_disk() skipped the per-pair byte-equality check, allowing duplicate PMKIDs through. 
Fix: add a disk_seen HashMap> that tracks seen PMKID values per pair. Populated during flush_to_disk() for already-stored entries. Costs ~20 bytes per unique PMKID. Also adds WPAWOLF_MEM_THRESHOLD env var override (integer percent) for testing disk fallback without needing a machine at 80% RSS. Verified: WPAWOLF_MEM_THRESHOLD=1 produces sorted-content-identical output to the in-memory path (SHA-256 match, 0 diff lines). --- src/main.rs | 6 +++++- src/mem_monitor.rs | 9 ++++++++- src/store/messages.rs | 8 ++++++++ src/store/pmkid.rs | 23 +++++++++++++++++++++-- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 295e267..b0e97d5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -768,6 +768,8 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { } stats.anonce_m1_m3_mismatch_sessions = stats.anonce_m1_m3_mismatch_sessions.saturating_add(message_store.count_anonce_m1_m3_mismatches()); + message_store.flush_disk_writer(); + pmkid_store.flush_disk_writer(); output_ctx.emit( &message_store, &pmkid_store, @@ -918,7 +920,9 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { debug.phase_start(4, "Emit"); } - // Single-pass emit over the fully populated stores. + message_store.flush_disk_writer(); + pmkid_store.flush_disk_writer(); + output_ctx.emit( &message_store, &pmkid_store, diff --git a/src/mem_monitor.rs b/src/mem_monitor.rs index 5b62546..c73fca8 100644 --- a/src/mem_monitor.rs +++ b/src/mem_monitor.rs @@ -35,10 +35,17 @@ pub struct MemMonitor { impl MemMonitor { /// Creates a new monitor. Probes total system RAM once at init. + /// + /// Override the 80% threshold via `WPAWOLF_MEM_THRESHOLD` (integer percent, + /// e.g. `WPAWOLF_MEM_THRESHOLD=1` triggers at 1% for testing). 
#[must_use] pub fn new() -> Self { let total_ram = progress::total_ram_bytes(); - let threshold_bytes = total_ram / 1000 * THRESHOLD_TENTHS; + let tenths = std::env::var("WPAWOLF_MEM_THRESHOLD") + .ok() + .and_then(|s| s.parse::().ok()) + .map_or(THRESHOLD_TENTHS, |pct| pct.min(100) * 10); + let threshold_bytes = total_ram / 1000 * tenths; Self { total_ram, threshold_bytes, last_rss: 0, disk_mode: false, packets_since_check: 0 } } diff --git a/src/store/messages.rs b/src/store/messages.rs index 6c434f6..e54fac7 100644 --- a/src/store/messages.rs +++ b/src/store/messages.rs @@ -305,6 +305,14 @@ impl MessageStore { messages } + /// Flushes the disk writer buffer. Must be called before `load_group()` to + /// ensure all records written via `add_to_disk()` are readable. + pub fn flush_disk_writer(&mut self) { + if let Some(w) = &mut self.disk_writer { + let _ = w.flush(); + } + } + /// Cleans up the temp file. Called on shutdown. pub fn cleanup_disk(&mut self) { if let Some(path) = self.disk_path.take() { diff --git a/src/store/pmkid.rs b/src/store/pmkid.rs index 22d1bb8..f1c819a 100644 --- a/src/store/pmkid.rs +++ b/src/store/pmkid.rs @@ -14,7 +14,7 @@ //! observed in M1 and M2 is stored only once. See `ARCHITECTURE.md §6` for the 20-location //! catalogue. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write as _}; use crate::store::disk_messages::{read_pmkid_entry, write_pmkid_entry}; @@ -63,6 +63,11 @@ struct PmkidRef { pub struct PmkidStore { groups: HashMap>, disk_index: HashMap>, + /// Per-pair seen-PMKID set for disk mode dedup. Kept in memory because + /// the 16-byte PMKID values are small (~20 bytes per entry with `HashSet` + /// overhead) and the total count is bounded by the number of unique PMKIDs + /// in the capture (typically <100K). 
+ disk_seen: HashMap>, disk_writer: Option>, disk_path: Option, disk_offset: u64, @@ -115,13 +120,16 @@ impl PmkidStore { } fn add_to_disk(&mut self, entry: &PmkidEntry) -> bool { + let pair = MacPair::new(entry.ap, entry.sta); + if !self.disk_seen.entry(pair).or_default().insert(entry.pmkid) { + return false; + } let Some(writer) = &mut self.disk_writer else { return false; }; let Ok(written) = write_pmkid_entry(writer, entry) else { return false; }; - let pair = MacPair::new(entry.ap, entry.sta); let refs = self.disk_index.entry(pair).or_default(); refs.push(PmkidRef { offset: self.disk_offset }); self.disk_offset += u64::from(written); @@ -143,6 +151,14 @@ impl PmkidStore { Box::new(self.groups.values().flatten().cloned()) } + /// Flushes the disk writer buffer. Must be called before `iter()` in disk + /// mode to ensure all records written via `add_to_disk()` are readable. + pub fn flush_disk_writer(&mut self) { + if let Some(w) = &mut self.disk_writer { + let _ = w.flush(); + } + } + fn load_all_entries(&self) -> Vec { let Some(path) = &self.disk_path else { return Vec::new(); @@ -188,7 +204,9 @@ impl PmkidStore { let old_groups = std::mem::take(&mut self.groups); for (pair, entries) in old_groups { let mut refs = Vec::with_capacity(entries.len()); + let seen = self.disk_seen.entry(pair).or_default(); for entry in &entries { + seen.insert(entry.pmkid); let written = write_pmkid_entry(&mut writer, entry).map_err(crate::types::Error::Io)?; refs.push(PmkidRef { offset }); offset += u64::from(written); @@ -256,6 +274,7 @@ impl PmkidStore { pub fn clear(&mut self) { self.groups.clear(); self.disk_index.clear(); + self.disk_seen.clear(); } /// Coarse heap + struct-bytes estimate for `--mem-stats` reporting. 
From 121f6698adfd3a878e85592f7fc8540af6dee076 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Thu, 28 May 2026 01:36:48 -0400 Subject: [PATCH 05/14] chore: update dependencies --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d32501d..8d3bdfc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "aes" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66bd29a732b644c0431c6140f370d097879203d79b80c94a6747ba0872adaef8" +checksum = "f1fc76eaeac4c9164506c466d4ffdd8ec9d0c5bf57ee97177c4d8eceb3a0e138" dependencies = [ "cipher", "cpubits", @@ -332,9 +332,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" [[package]] name = "miniz_oxide" From 52cb657dd12781aab3d437d336537a52f8f7fb47 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Thu, 28 May 2026 01:49:50 -0400 Subject: [PATCH 06/14] refactor: remove --per-file flag The disk-backed fallback makes --per-file unnecessary -- memory pressure is handled automatically by spilling stores to disk. Remove the flag, all per-file code paths (per-file emit loop, per-file WDS resolve, per-file MLD canonicalization), and the per-file integration test. --strict now bundles 4 filters instead of 5: --eapoltimeout=5, --rc-drift=8, --dedup-hash-combos, --nc-dedup. 
--- src/main.rs | 94 +++----------------------- src/pair/mod.rs | 2 +- tests/integration/nc_dedup_collapse.rs | 85 ----------------------- 3 files changed, 10 insertions(+), 171 deletions(-) diff --git a/src/main.rs b/src/main.rs index b0e97d5..2cbcf3e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -162,9 +162,9 @@ struct Cli { log: Option, // ---- Output filters ---- - /// Narrow output like hcxpcapngtool (bundle of 5 filters) + /// Narrow output like hcxpcapngtool (bundle of 4 filters) /// - /// Enables: --eapoltimeout=5, --rc-drift=8, --dedup-hash-combos, --per-file, --nc-dedup. Later flags override these defaults. + /// Enables: --eapoltimeout=5, --rc-drift=8, --dedup-hash-combos, --nc-dedup. Later flags override these defaults. #[arg(short = 's', long, help_heading = "Output filters", display_order = 20)] strict: bool, @@ -235,12 +235,6 @@ struct Cli { #[arg(short = 'q', long, help_heading = "Runtime", display_order = 31)] quiet: bool, - /// Flush stores after each input file (no cross-file pairing) - /// - /// MessageStore and PmkidStore clear per file. Bounds RSS for large corpora at the cost of cross-file pairing (< 1% hash yield drop on per-session captures). - #[arg(long = "per-file", help_heading = "Runtime", display_order = 32)] - per_file: bool, - /// Print per-store memory footprint at end of run /// /// Approximate byte counts for every long-lived store (MessageStore, PmkidStore, EssidMap, etc.), sorted descending. For OOM triage. @@ -257,10 +251,10 @@ struct Cli { /// Apply `--strict` mode's bundled defaults to a parsed CLI. /// /// `--strict` is a shortcut for a hcxpcapngtool-shape narrow output profile. 
It -/// turns on the five output filters that together close the volume gap against +/// turns on the four output filters that together close the volume gap against /// hcxpcapngtool default (`--eapoltimeout=5`, `--rc-drift=8`, -/// `--dedup-hash-combos`, `--per-file`, `--nc-dedup`), but uses later-flag-wins -/// precedence so an explicit `--eapoltimeout=30` survives past `--strict`. The three boolean +/// `--dedup-hash-combos`, `--nc-dedup`), but uses later-flag-wins precedence so +/// an explicit `--eapoltimeout=30` survives past `--strict`. The two boolean /// flags can only be turned on, never off, so `--strict` always sets them. const fn apply_strict_defaults(cli: &mut Cli) { if !cli.strict { @@ -273,7 +267,6 @@ const fn apply_strict_defaults(cli: &mut Cli) { cli.rc_drift = Some(8); } cli.dedup_hash_combos = true; - cli.per_file = true; cli.nc_dedup = true; } @@ -737,53 +730,6 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { } } } - - // --- Per-file emit (--per-file mode only) --- - // - // Resolve any deferred WDS frames seen this file (they need an ESSID - // context; `essid_map` accumulates across files so even cross-file - // ESSID-based resolution still works), MLD-canonicalize the per-file - // stores, emit hashes for what we have, then drop the per-file EAPOL - // and PMKID state. Auxiliaries (`-E`/`-W`/...), `essid_map`, - // `akm_map`, `mld_store`, and the dedup state inside `output_ctx` - // accumulate across files. See `ARCHITECTURE.md §3` for the - // cross-file pairing tradeoff. 
- if cli.per_file { - if !pending_eapol.is_empty() { - resolve_wds_eapol( - &pending_eapol, - &essid_map, - &mut akm_map, - &mut message_store, - &mut pmkid_store, - &mut stats, - &mut logger, - ); - pending_eapol.clear(); - } - if !mld_store.is_empty() { - let merged = message_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); - stats.mld_groups_merged = stats.mld_groups_merged.saturating_add(merged); - pmkid_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); - } - stats.anonce_m1_m3_mismatch_sessions = - stats.anonce_m1_m3_mismatch_sessions.saturating_add(message_store.count_anonce_m1_m3_mismatches()); - message_store.flush_disk_writer(); - pmkid_store.flush_disk_writer(); - output_ctx.emit( - &message_store, - &pmkid_store, - &essid_map, - &akm_map, - &pair_config, - thread_count, - essid_filter, - &debug, - &mut mem_monitor, - )?; - message_store.clear(); - pmkid_store.clear(); - } } // Final progress line at the end of Phase 1 so an operator always sees the @@ -794,11 +740,10 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { progress.print_now(stats.total_packets, stats.input_file_count, eapol_total, stats.pmkids_found); } - // --- Phase 1.5: Resolve deferred WDS EAPOL frames (non-per-file mode only) --- + // --- Phase 1.5: Resolve deferred WDS EAPOL frames --- // WDS relay frames had ambiguous direction during Phase 1. Now that essid_map is fully // populated, resolve them using essid_map lookup, ACK-based AP discovery, or flag fallback. - // In `--per-file` mode the resolve already ran per-file inside the ingest loop. 
- if !cli.per_file && !pending_eapol.is_empty() { + if !pending_eapol.is_empty() { let wds_count = pending_eapol.len(); resolve_wds_eapol( &pending_eapol, @@ -836,15 +781,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { stats.username_list_path = path_str(&cli.username_output); stats.device_info_path = path_str(&cli.device_output); - if cli.per_file { - // Per-file mode also re-canonicalizes essid_map at end of run because - // some link-MAC SSIDs may have been filed under their pre-MLD address - // before the corresponding MLE was learned. Cheap because it only - // touches the AP-keyed map. - if !mld_store.is_empty() { - stats.essid_link_macs_merged = essid_map.canonicalize_pairs(|m| mld_store.canonicalize(m)); - } - } else { + { // 802.11be MLD canonicalization: if any Multi-Link Element was seen, rewrite all // MessageStore and PmkidStore keys so link addresses collapse onto the MLD identity. // When no MLE was observed, this is a no-op and byte-identical to pre-MLE behavior. @@ -853,15 +790,9 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { let merged = message_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); stats.mld_groups_merged = merged; pmkid_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); - // Fold link-MAC SSIDs into the canonical MLD MAC so essid_map lookups by - // canonical AP key (post-canonicalization on the pair side) actually find - // them. Without this, hidden-SSID resolution silently fails for any MLD - // AP whose SSID was advertised under a band-specific link MAC. stats.essid_link_macs_merged = essid_map.canonicalize_pairs(|m| mld_store.canonicalize(m)); } - // Capture-quality diagnostic: count sessions whose M1 and M3 ANonce disagree. - // Per IEEE 802.11-2024 §12.7.6.4 they must match in the same handshake session. stats.anonce_m1_m3_mismatch_sessions = message_store.count_anonce_m1_m3_mismatches(); // Phase 1 complete; log the full store state and the top heavy groups before Phase 4. 
@@ -1061,7 +992,6 @@ mod tests { assert_eq!(cli.eapoltimeout, None, "no --strict -> eapoltimeout stays None (unlimited)"); assert_eq!(cli.rc_drift, None, "no --strict -> rc_drift stays None (off)"); assert!(!cli.dedup_hash_combos, "no --strict -> dedup_hash_combos stays off"); - assert!(!cli.per_file, "no --strict -> per_file stays off"); assert!(!cli.nc_dedup, "no --strict -> nc_dedup stays off"); } @@ -1072,7 +1002,6 @@ mod tests { assert_eq!(cli.eapoltimeout, Some(5), "--strict -> 5 s session window"); assert_eq!(cli.rc_drift, Some(8), "--strict -> RC drift tolerance 8"); assert!(cli.dedup_hash_combos, "--strict -> dedup_hash_combos on"); - assert!(cli.per_file, "--strict -> per_file on"); assert!(cli.nc_dedup, "--strict -> nc_dedup on"); } @@ -1084,7 +1013,6 @@ mod tests { assert_eq!(cli.eapoltimeout, Some(30), "explicit user value must override --strict default"); assert_eq!(cli.rc_drift, Some(8), "untouched filters still take strict defaults"); assert!(cli.dedup_hash_combos); - assert!(cli.per_file); assert!(cli.nc_dedup); } @@ -1094,7 +1022,6 @@ mod tests { assert_eq!(cli.rc_drift, Some(4), "explicit --rc-drift=4 wins over strict's 8"); assert_eq!(cli.eapoltimeout, Some(5)); assert!(cli.dedup_hash_combos); - assert!(cli.per_file); assert!(cli.nc_dedup); } @@ -1104,18 +1031,15 @@ mod tests { assert_eq!(cli.eapoltimeout, Some(60)); assert_eq!(cli.rc_drift, Some(2)); assert!(cli.dedup_hash_combos, "strict still enables the three boolean filters"); - assert!(cli.per_file); assert!(cli.nc_dedup); } #[test] fn strict_idempotent_with_already_set_bools() { - // --strict --per-file --dedup-hash-combos --nc-dedup is the same as --strict alone. 
- let cli = parse_with_strict(&["--strict", "--per-file", "--dedup-hash-combos", "--nc-dedup"]); + let cli = parse_with_strict(&["--strict", "--dedup-hash-combos", "--nc-dedup"]); assert_eq!(cli.eapoltimeout, Some(5)); assert_eq!(cli.rc_drift, Some(8)); assert!(cli.dedup_hash_combos); - assert!(cli.per_file); assert!(cli.nc_dedup); } diff --git a/src/pair/mod.rs b/src/pair/mod.rs index c3d817b..d51b963 100644 --- a/src/pair/mod.rs +++ b/src/pair/mod.rs @@ -208,7 +208,7 @@ where let rss_mib = crate::progress::current_rss_mib().unwrap_or(0); let total_mib = crate::progress::total_ram_bytes() / (1024 * 1024); println!( - "error: approaching OOM -- RSS {rss_mib} MiB / {total_mib} MiB (>= 80%) during Phase 4 pairing. Reduce input size, use --per-file, or increase available RAM." + "error: approaching OOM -- RSS {rss_mib} MiB / {total_mib} MiB (>= 80%) during Phase 4 pairing. Reduce input size or increase available RAM." ); std::process::exit(1); } diff --git a/tests/integration/nc_dedup_collapse.rs b/tests/integration/nc_dedup_collapse.rs index c04447f..24b58a9 100644 --- a/tests/integration/nc_dedup_collapse.rs +++ b/tests/integration/nc_dedup_collapse.rs @@ -252,40 +252,6 @@ fn fixture_paths(name: &str) -> (PathBuf, PathBuf) { /// Beacon + 9 M1 frames whose `ANonce` tails cycle through nine consecutive /// values starting at `tail_start` + a single M2 to pair against. /// -/// At `--nc-tolerance=8` the 9-element span-8 cluster always collapses to -/// exactly one survivor under the safest-survivor rule (median is the -/// hashcat-safest observation for a dense cluster). 
-fn build_single_cluster_pcap(ap: [u8; 6], sta: [u8; 6], ssid: &[u8], tail_start: u8) -> Vec { - let anonce_prefix: [u8; 28] = [ - 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, - 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, - ]; - let snonce: [u8; 32] = [ - 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, - 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, - ]; - let mic = [0x10, 0x21, 0x32, 0x43, 0x54, 0x65, 0x76, 0x87, 0x98, 0xA9, 0xBA, 0xCB, 0xDC, 0xED, 0xFE, 0x0F]; - - let mut buf = common::pcap_global_header().to_vec(); - let mut ts: u32 = 1_000_000; - buf.extend_from_slice(&common::pcap_packet(ts, &beacon_wpa2_psk(ssid, ap))); - ts += 1; - for offset in 0u8..9 { - let mut nonce = [0u8; 32]; - nonce[..28].copy_from_slice(&anonce_prefix); - nonce[28] = 0x00; - nonce[29] = 0x00; - nonce[30] = 0x00; - nonce[31] = tail_start.wrapping_add(offset); - let m1_body = eapol_key_body(true, false, false, false, nonce, [0u8; 16]); - buf.extend_from_slice(&common::pcap_packet(ts, &data_frame_dl(ap, sta, &m1_body))); - ts += 1; - } - let m2_body = eapol_key_body(false, false, true, false, snonce, mic); - buf.extend_from_slice(&common::pcap_packet(ts, &data_frame_ul(ap, sta, &m2_body))); - buf -} - /// Builds a pcap with one Beacon + two M1 frames whose `ANonce` tails sit at /// the cluster span edges -- `tail=0x00` and `tail=tolerance` -- plus a /// single M2. 
With `--nc-tolerance=tolerance` no observed nonce can serve as @@ -368,57 +334,6 @@ fn nc_tolerance_tighter_value_splits_into_more_clusters() { assert_eq!(lines, 6, "tolerance=4 produces 5 cluster survivors + 1 isolated singleton"); } -#[test] -fn nc_dedup_per_file_counters_accumulate_across_files() { - // Regression-pin for the `OutputStats.nc_dedup_* =` bug: under `--per-file` - // each input file's `emit_inner` was overwriting the prior file's stats, - // so the closing banner only reflected the last file's NC-dedup activity. - // - // Build three pcaps, each with its own dense 9-element cluster (different - // AP MACs so the clusters do not bucket together). Run with - // `--per-file --nc-dedup` against the containing directory and assert the - // banner counters report the sum across all three files: 24 lines - // collapsed (3 * 8), 3 clusters total, max cluster size 9. - let dir = common::temp_dir("wpawolf_nc_dedup_perfile"); - let out_path = dir.join("perfile.22000"); - // Drop any pcaps left behind by a prior failing run -- directory input - // expansion would otherwise pick them up alongside the three new ones. - if let Ok(entries) = fs::read_dir(&dir) { - for entry in entries.flatten() { - let _ = fs::remove_file(entry.path()); - } - } - let files = [ - ([0x02u8, 0x11, 0x22, 0x33, 0x44, 0x01], [0x02u8, 0xAA, 0xBB, 0xCC, 0xDD, 0x01], b"WolfNetA", 0x40u8), - ([0x02u8, 0x11, 0x22, 0x33, 0x44, 0x02], [0x02u8, 0xAA, 0xBB, 0xCC, 0xDD, 0x02], b"WolfNetB", 0x60u8), - ([0x02u8, 0x11, 0x22, 0x33, 0x44, 0x03], [0x02u8, 0xAA, 0xBB, 0xCC, 0xDD, 0x03], b"WolfNetC", 0x80u8), - ]; - for (i, (ap, sta, ssid, tail)) in files.iter().enumerate() { - let pcap_bytes = build_single_cluster_pcap(*ap, *sta, *ssid, *tail); - fs::write(dir.join(format!("file{i}.pcap")), pcap_bytes).unwrap(); - } - let (lines, stdout) = run_capture(&dir, &out_path, &["--per-file", "--nc-dedup"]); - // One survivor per file -- three files -> three lines on disk. 
- assert_eq!(lines, 3, "each file's 9-element cluster collapses to one survivor"); - // The fix in OutputContext::emit_inner accumulates these across emit - // calls; the pre-fix value would have been 8 / 1 / 9 (last file only). - assert_eq!( - banner_counter(&stdout, "NC-dedup near-identical-nonce lines collapsed"), - 24, - "per-file mode must accumulate collapsed_lines across files (3 files * 8 collapsed each)" - ); - assert_eq!( - banner_counter(&stdout, "NC-dedup cluster count"), - 3, - "per-file mode must accumulate cluster_count across files (one cluster per file)" - ); - assert_eq!( - banner_counter(&stdout, "NC-dedup max cluster size"), - 9, - "per-file mode must take the global max across files (all three clusters are size 9)" - ); -} - #[test] fn nc_dedup_sparse_edge_cluster_does_not_collapse_via_cli() { // Regression-pin for the survivor-coverage bug: a cluster of exactly two From eef2a8e87a42f0fe6ec1f74e7939bb3583fd60b0 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:01:21 -0400 Subject: [PATCH 07/14] docs: standardize README; add example; install before quick start --- README.md | 66 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index fa2fb35..2c28e0e 100644 --- a/README.md +++ b/README.md @@ -6,28 +6,35 @@

CI - License Edition 2024 - MSRV 1.85 + MSRV 1.95 + License: Apache 2.0 +

+ +

+ Example • + Installation • + CLI reference • + Further reading

--- ## Features -- **Pure safe Rust** -- `#![forbid(unsafe_code)]`, four runtime crates (`flate2` + `clap` + `rayon` + `sysinfo`) -- **Parallel pairing** -- rayon work-stealing across CPU cores with streaming per-group fan-out -- **Wide defaults** -- emits every valid handshake; you filter at the end -- **Cross-file pairing** -- M1 in file A pairs with M2 in file B -- **20 PMKID extraction sites** -- every spec-defined location wired and counted -- **Deep frame walking** -- A-MSDU subframes, MSDU fragment reassembly, radiotap FCS strip -- **Garbage-pattern rejection** -- nonces / MICs / PMKIDs checked against five pattern classes -- **Fast** -- >=200 MB/s on NVMe; Phase 1 I/O-bound, Phase 4 CPU-parallel +- **Pure safe Rust** - `#![forbid(unsafe_code)]`, four runtime crates (`flate2` + `clap` + `rayon` + `sysinfo`) +- **Parallel pairing** - rayon work-stealing across CPU cores with streaming per-group fan-out +- **Wide defaults** - emits every valid handshake; you filter at the end +- **Cross-file pairing** - M1 in file A pairs with M2 in file B +- **20 PMKID extraction sites** - every spec-defined location wired and counted +- **Deep frame walking** - A-MSDU subframes, MSDU fragment reassembly, radiotap FCS strip +- **Garbage-pattern rejection** - nonces / MICs / PMKIDs checked against five pattern classes +- **Fast** - >=200 MB/s on NVMe; Phase 1 I/O-bound, Phase 4 CPU-parallel - **904 tests**; `make check-all` zero-warning under strict clippy --- -## Quick start +## Example ```sh wpawolf --22000-out hashes.22000 --37100-out hashes.37100 capture.pcap @@ -61,8 +68,9 @@ Download from [GitHub Releases](https://github.com/StrongWind1/WPAWolf/releases) ### From source ```sh -git clone https://github.com/StrongWind1/WPAWolf && cd WPAWolf -make build # release binary at target/release/wpawolf +git clone https://github.com/StrongWind1/WPAWolf +cd WPAWolf +make release # optimised native build -> target/release/wpawolf ``` Requires a stable Rust toolchain (see 
`rust-toolchain.toml`). See [`CONTRIBUTING.md`](CONTRIBUTING.md) for the full development workflow. @@ -97,9 +105,9 @@ wpawolf --22000-out hashes.22000 --eapoltimeout 3 --rc-drift 4 \ --- -## How wpawolf compares to hcxpcapngtool +## How WPAWolf compares to hcxpcapngtool -Both tools cover the same AKM scope (PSK and FT-PSK). The difference is default policy: `hcxpcapngtool` filters hard at extraction time; `wpawolf` emits everything and leaves filtering to you. +Both tools cover the same AKM scope (PSK and FT-PSK). The difference is default policy: `hcxpcapngtool` filters hard at extraction time; WPAWolf emits everything and leaves filtering to you. | Behaviour | `hcxpcapngtool` (default) | `wpawolf` (default) | |---|---|---| @@ -123,9 +131,9 @@ Both tools cover the same AKM scope (PSK and FT-PSK). The difference is default | Flag | Categories | Cracks in hashcat today? | |---|---|---| -| `--22000-out FILE` | every non-FT hash (`WPA*01*`/`WPA*02*`) | yes -- mode 22000 | -| `--37100-out FILE` | every FT hash (`WPA*03*`/`WPA*04*`) | yes -- mode 37100 | -| `-o`, `--out FILE` | every emitted hash (`WPA*01*..*11*`, per-AKM format) | no -- needs proposed mode 22002/22003 | +| `--22000-out FILE` | every non-FT hash (`WPA*01*`/`WPA*02*`) | yes - mode 22000 | +| `--37100-out FILE` | every FT hash (`WPA*03*`/`WPA*04*`) | yes - mode 37100 | +| `-o`, `--out FILE` | every emitted hash (`WPA*01*..*11*`, per-AKM format) | no - needs proposed mode 22002/22003 | | `--wpa1-out FILE` | category 1 | no | | `--wpa2-out FILE` | categories 2 + 3 | no | | `--psk-sha256-out FILE` | categories 4 + 5 | no | @@ -198,13 +206,25 @@ See [`CONTRIBUTING.md`](CONTRIBUTING.md) for the development workflow, parity or ## Credits -wpawolf is a ground-up rewrite of [ZerBea/hcxtools](https://github.com/ZerBea/hcxtools)' `hcxpcapngtool`. The reference C implementation and its two custom variants informed every design decision in this project. 
+WPAWolf is a ground-up rewrite of [ZerBea/hcxtools](https://github.com/ZerBea/hcxtools)' `hcxpcapngtool`. The reference C implementation and its two custom variants informed every design decision in this project. --- -## License +## Related tools -Apache 2.0. See [`LICENSE`](LICENSE). +Other projects in this collection: + +- [WiFi_Cracking](https://github.com/StrongWind1/WiFi_Cracking) - IEEE 802.11 security reference and attack guide +- [NFSWolf](https://github.com/StrongWind1/NFSWolf) - native NFS security toolkit + +--- + +## Disclaimer + +WPAWolf operates on pcap files you already have on disk. It does not capture traffic, inject frames, or touch a radio. It is intended for authorized security research only; running it on captures you do not own or lack written authorization to analyze is illegal in most jurisdictions. The authors are not responsible for any misuse or damage caused by this tool. + +--- + +## License -> [!IMPORTANT] -> `wpawolf` operates on pcap files you already have on disk. It does not capture traffic, inject frames, or touch a radio. Running it on captures you don't own or lack written authorization to analyse is illegal in most jurisdictions. +[Apache License 2.0](LICENSE) From 5677f2217010929a3c1f9a7e6059c92b01b53f55 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:01:24 -0400 Subject: [PATCH 08/14] build: add make release alias and set msrv check to 1.95 --- .github/workflows/ci.yml | 14 +++++++------- Makefile | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57b05ce..7b8107c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ # cargo test -- compiles and runs unit + integration tests. # cargo doc -- builds HTML API documentation from doc-comments. # clippy -- the Rust linter. Part of the standard toolchain. -# MSRV -- Minimum Supported Rust Version. 
wpawolf declares 1.85. +# MSRV -- Minimum Supported Rust Version. wpawolf declares 1.95. # ============================================================================= name: CI @@ -226,11 +226,11 @@ jobs: # --------------------------------------------------------------------------- # msrv -- Minimum Supported Rust Version. Cargo.toml declares - # `rust-version = "1.85"`. This job proves we haven't accidentally used a + # `rust-version = "1.95"`. This job proves we haven't accidentally used a # language feature newer than that. # --------------------------------------------------------------------------- msrv: - name: msrv (1.85) + name: msrv (1.95) runs-on: ubuntu-latest timeout-minutes: 20 steps: @@ -239,14 +239,14 @@ jobs: with: persist-credentials: false - - name: Install Rust 1.85 toolchain - # rust-toolchain.toml pins a different version for primary jobs; here + - name: Install Rust 1.95 toolchain + # rust-toolchain.toml uses stable (latest) for primary jobs; here # we install a second toolchain side-by-side and make it the default # for this job only. --profile minimal skips extras. run: | set -euo pipefail - rustup toolchain install 1.85.0 --profile minimal --component clippy - rustup default 1.85.0 + rustup toolchain install 1.95.0 --profile minimal --component clippy + rustup default 1.95.0 - name: Cache cargo build artefacts uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 diff --git a/Makefile b/Makefile index 780bd47..e029286 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,10 @@ dev: build: $(CARGO) build --profile release --all-features +# Alias so `make release` builds the native release binary. +.PHONY: release +release: build + # Fast type-check, no codegen. 
check: $(CARGO) check --all-targets --all-features From 47579291e06f83ea7d33f0ba5043218201cfdffa Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Tue, 9 Jun 2026 15:40:49 -0400 Subject: [PATCH 09/14] chore: update dependencies --- Cargo.lock | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8d3bdfc..0dd21c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,9 +71,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "block-buffer" @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "cmov" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746" +checksum = "0c9ea0ac24bc397ab3c98583a3c9ba74fa56b09a4449bbe172b9b1ddb016027a" [[package]] name = "colorchoice" @@ -513,9 +513,9 @@ dependencies = [ [[package]] name = "sysinfo" -version = "0.39.2" +version = "0.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14311e7e9a03114cd4b65eedd54e8fed2945e17f08586ae97ef53bc0669f9581" +checksum = "21d0d938c10fcda3e897e28aaddf4ab462375d411f4378cd63b1c945f69aba96" dependencies = [ "libc", "memchr", @@ -528,9 +528,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.20.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" [[package]] name = "unicode-ident" From 01516a7bae87f7c3995526f5e8dc3ffaf6350cfe Mon Sep 17 00:00:00 2001 From: StrongWind 
<5987034+StrongWind1@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:02:53 -0400 Subject: [PATCH 10/14] fix: keep link-keyed handshakes when canonicalizing MLD addresses MessageStore, PmkidStore, and EssidMap canonicalization rewrote each link-MAC group/entry onto the MLD MAC and discarded the original. A single-link association to one BSSID of an 802.11be MLD derives its PTK under the link MAC, so the rewritten MLD-keyed line could not crack. Make canonicalization additive: keep the link-keyed form and add an MLD-keyed copy so both reach output and the crackable one is always present, in memory and disk modes. Not rebuilding EssidMap also preserves the per-SSID observation counts, which restores the --essid-collapse filter that the old mem::take rebuild defeated by resetting every count to 1. --- src/store/essid.rs | 66 +++++++++++++++++++------------ src/store/messages.rs | 92 +++++++++++++++++++++++++++++-------------- src/store/pmkid.rs | 66 ++++++++++++++++++++----------- 3 files changed, 145 insertions(+), 79 deletions(-) diff --git a/src/store/essid.rs b/src/store/essid.rs index 4c77f2e..a46fb7f 100644 --- a/src/store/essid.rs +++ b/src/store/essid.rs @@ -287,23 +287,36 @@ impl EssidMap { where F: FnMut(MacAddr) -> MacAddr, { - let old_map = std::mem::take(&mut self.map); - let mut merged_link_macs: u64 = 0; - for (ap, entries) in old_map { - let canon = canonicalize(ap); - if canon != ap { - merged_link_macs += 1; - } - for entry in entries { - // Reuse the same dedup-by-bytes / earliest-timestamp semantics as - // `insert`. The hcx-essid filter was already applied at original - // insert time, so re-checking here is a no-op for accepted entries. - // The Arc<[u8]> already lives in the interner; `insert` will find - // it on the dedup path and reuse the existing entry. - self.insert(canon, &entry.essid, entry.timestamp); + // Additive: keep every link-MAC SSID entry and ALSO copy it under the + // canonical MLD MAC. 
Both the link-keyed and the MLD-keyed forms of a + // handshake now reach output (see `MessageStore::canonicalize_pairs`), so + // ESSID resolution must succeed for both: the link-keyed line resolves via + // the original entry, the MLD-keyed line via the copy. The prior version + // moved the entry to the MLD MAC, which left the link-keyed line with no + // SSID and dropped it as uncrackable. + // + // Not rebuilding the map is also what preserves the per-SSID `count`. The + // old `mem::take` + re-insert reset every count to 1 (re-inserting one + // entry bumps 0->1, not by its stored count), which silently defeated the + // frequency-weighted `ssids_for_emit` collapse -- it cannot pick a dominant + // SSID when every variant looks like a count-1 bit-flip. Preserving counts + // here restores that collapse, so RF-rotted SSID variants are dropped as + // designed (their crackable dominant-SSID sibling is kept). + let mut additions: Vec<(MacAddr, Arc<[u8]>, u64)> = Vec::new(); + let mut copied_link_macs: u64 = 0; + for (ap, entries) in &self.map { + let canon = canonicalize(*ap); + if canon != *ap { + copied_link_macs += 1; + for entry in entries { + additions.push((canon, Arc::clone(&entry.essid), entry.timestamp)); + } } } - merged_link_macs + for (canon, essid, timestamp) in additions { + self.insert(canon, &essid, timestamp); + } + copied_link_macs } } @@ -641,11 +654,11 @@ mod tests { } #[test] - fn canonicalize_pairs_folds_link_macs_into_mld() { - // Two link MACs (0x11, 0x22) advertising the same SSID under their own - // raw addresses get folded into one MLD canonical key (0xAA). After - // canonicalize_pairs, all_for_ap(MLD) returns the SSID; the link MAC - // entries no longer exist. + fn canonicalize_pairs_copies_link_ssids_to_mld_and_keeps_links() { + // Two link MACs (0x11, 0x22) advertising the same SSID get an MLD-keyed + // copy (0xAA) added, while the link MAC entries are KEPT. 
ESSID resolution + // must succeed for a handshake emitted under either the link key or the + // MLD key, so both forms must carry the SSID. let mut m = EssidMap::new(); let link_a = mac(0x11); let link_b = mac(0x22); @@ -654,16 +667,17 @@ mod tests { m.insert(link_b, b"HomeNet", 200); // 6 GHz link MAC, same SSID assert_eq!(m.ap_count(), 2); - let merged = m.canonicalize_pairs(|x| if x == link_a || x == link_b { mld } else { x }); + let copied = m.canonicalize_pairs(|x| if x == link_a || x == link_b { mld } else { x }); - assert_eq!(merged, 2, "both link MACs should have been merged"); - assert_eq!(m.ap_count(), 1, "single MLD key remains"); + assert_eq!(copied, 2, "both link MACs received an MLD copy"); + assert_eq!(m.ap_count(), 3, "two link keys kept + one MLD key added"); let entries = m.all_for_ap(&mld); - assert_eq!(entries.len(), 1, "duplicate SSID merged into one entry"); + assert_eq!(entries.len(), 1, "duplicate SSID deduped into one MLD entry"); assert_eq!(&entries[0].essid[..], b"HomeNet"); assert_eq!(entries[0].timestamp, 100, "earliest timestamp preserved"); - assert!(m.all_for_ap(&link_a).is_empty(), "link MAC entry gone after fold"); - assert!(m.all_for_ap(&link_b).is_empty(), "link MAC entry gone after fold"); + // Link entries survive so single-link lines still resolve their SSID. + assert_eq!(&m.all_for_ap(&link_a)[0].essid[..], b"HomeNet", "link_a entry kept"); + assert_eq!(&m.all_for_ap(&link_b)[0].essid[..], b"HomeNet", "link_b entry kept"); } #[test] diff --git a/src/store/messages.rs b/src/store/messages.rs index e54fac7..b3f84f4 100644 --- a/src/store/messages.rs +++ b/src/store/messages.rs @@ -347,15 +347,32 @@ impl MessageStore { size_of::() + groups_cap_bytes + msgs_bytes } - /// Rewrites every group key and embedded `(ap, sta)` addresses using `canonicalize`, - /// then merges groups that collide under the canonical key. 
+ /// Adds an MLD-keyed copy of every group whose `(ap, sta)` canonicalizes to a + /// different pair, **keeping the original link-keyed group**. /// /// Callers typically pass a closure that looks up the MLD MAC in an `MldStore`; any /// link address unknown to the store is returned unchanged. Groups whose canonical - /// keys are already unique (the non-11be case) are preserved bit-identically. + /// keys are unchanged (the non-11be case) are left bit-identical. /// - /// Returns the number of `(AP, STA)` groups that were merged into another group as - /// a result of canonicalization -- zero when no MLD mapping changed any key. + /// Additive, not destructive: a single-link association to one BSSID of an MLD + /// derives its PTK under the **link** MAC, so the link-keyed line is the crackable + /// one and must survive; a true multi-link association derives the PTK under the + /// **MLD** MAC, so the MLD-keyed copy is the crackable one. Storing both guarantees + /// the crackable line is always emitted -- the same "emit every candidate" rule as + /// the six N#E# combos. A destructive rename (the prior behaviour) silently dropped + /// single-link handshakes on MLD-capable APs onto the MLD MAC, producing only an + /// uncrackable line. [IEEE 802.11be] §35.3 + /// + /// Dedup is unaffected: a link-keyed line and its MLD-keyed copy carry different AP + /// MACs, so they are genuinely distinct hash lines (different output fingerprint), + /// not duplicates -- emitting both is the point. Any true duplicate that the copy + /// could create (e.g. a real MLD group that already held the same message) is still + /// collapsed by the per-group inline dedup in `pair::generate` and the global + /// `SipHash` set in `output`. Verified `sort | uniq -d` empty across a multi-vendor + /// corpus after this change (FR-CORRECT-2). + /// + /// Returns the number of groups that received a canonical copy (0 when no MLD + /// mapping changed any key). 
pub fn canonicalize_pairs<F>(&mut self, mut canonicalize: F) -> u64 where F: FnMut(MacAddr) -> MacAddr, @@ -363,35 +380,42 @@ impl MessageStore { if self.disk_mode { return self.canonicalize_pairs_disk(&mut canonicalize); } - let old = std::mem::take(&mut self.groups); - let old_group_count = old.len(); - let old_total = self.total_count; - self.total_count = 0; - for (pair, mut msgs) in old { - let canon_ap = canonicalize(pair.ap); - let canon_sta = canonicalize(pair.sta); - let canon_pair = MacPair::new(canon_ap, canon_sta); + // Collect the MLD-keyed copies first so we do not mutate `groups` while + // iterating it. Only groups whose canonical key differs get a copy. + let mut additions: Vec<(MacPair, Vec<EapolMessage>)> = Vec::new(); + for (pair, msgs) in &self.groups { + let canon_pair = MacPair::new(canonicalize(pair.ap), canonicalize(pair.sta)); + if canon_pair != *pair { + additions.push((canon_pair, msgs.clone())); + } + } + let canonicalized = additions.len() as u64; + for (canon_pair, mut msgs) in additions { self.total_count += msgs.len(); self.groups.entry(canon_pair).or_default().append(&mut msgs); } - debug_assert_eq!(self.total_count, old_total, "canonicalization must not drop messages"); - (old_group_count as u64).saturating_sub(self.groups.len() as u64) + canonicalized } - /// Disk-mode canonicalization: rewrite index keys without loading message data. + /// Disk-mode canonicalization: add MLD-keyed index entries without loading + /// message data, keeping the original link-keyed entries. The copied refs point + /// at the same on-disk records, so both keys reconstruct the same messages.
fn canonicalize_pairs_disk(&mut self, canonicalize: &mut F) -> u64 where F: FnMut(MacAddr) -> MacAddr, { - let old_index = std::mem::take(&mut self.disk_index); - let old_group_count = old_index.len(); - for (pair, refs) in old_index { - let canon_ap = canonicalize(pair.ap); - let canon_sta = canonicalize(pair.sta); - let canon_pair = MacPair::new(canon_ap, canon_sta); + let mut additions: Vec<(MacPair, Vec)> = Vec::new(); + for (pair, refs) in &self.disk_index { + let canon_pair = MacPair::new(canonicalize(pair.ap), canonicalize(pair.sta)); + if canon_pair != *pair { + additions.push((canon_pair, refs.clone())); + } + } + let canonicalized = additions.len() as u64; + for (canon_pair, refs) in additions { self.disk_index.entry(canon_pair).or_default().extend(refs); } - (old_group_count as u64).saturating_sub(self.disk_index.len() as u64) + canonicalized } /// Folds the earliest and latest message timestamps for every AP MAC in @@ -766,9 +790,12 @@ mod tests { } #[test] - fn canonicalize_pairs_merges_via_mld() { + fn canonicalize_pairs_adds_mld_copy_and_keeps_link_groups() { // Two distinct (AP, STA) groups whose STA addresses both canonicalize to the - // same MLD MAC -> they should merge into a single group after canonicalization. + // same MLD MAC. Additive canonicalization keeps both link-keyed groups AND + // adds an MLD-keyed group that gathers both link copies -- so a true + // multi-link handshake can pair across links (the MLD group has M1 + M3) + // while a single-link handshake's link-keyed line still survives. 
let mut store = MessageStore::new(); let ap = mac(0xAA); let link_a = mac(0x11); @@ -777,12 +804,17 @@ mod tests { store.add(ap, link_a, msg_with_nonce(MsgType::M1, [0x01; 32])); store.add(ap, link_b, msg_with_nonce(MsgType::M3, [0x01; 32])); assert_eq!(store.group_count(), 2); - assert_eq!(store.total_count(), 2); - let merged = store.canonicalize_pairs(|m| if m == link_a || m == link_b { mld } else { m }); - assert_eq!(store.group_count(), 1, "two links sharing one MLD must merge"); - assert_eq!(store.total_count(), 2, "messages preserved across merge"); - assert_eq!(merged, 1, "one group was merged into another"); + let canonicalized = store.canonicalize_pairs(|m| if m == link_a || m == link_b { mld } else { m }); + assert_eq!(canonicalized, 2, "both link groups received an MLD copy"); + assert_eq!(store.group_count(), 3, "two link groups kept + one MLD group added"); + + // Originals survive (single-link crackability). + let group_lens: HashMap<MacPair, usize> = store.groups().map(|(p, m)| (*p, m.len())).collect(); + assert_eq!(group_lens.get(&MacPair::new(ap, link_a)).copied(), Some(1), "link_a group kept"); + assert_eq!(group_lens.get(&MacPair::new(ap, link_b)).copied(), Some(1), "link_b group kept"); + // MLD group gathers both copies, so it can pair M1 with M3 across links. + assert_eq!(group_lens.get(&MacPair::new(ap, mld)).copied(), Some(2), "MLD group has both messages"); } #[test] diff --git a/src/store/pmkid.rs b/src/store/pmkid.rs index f1c819a..72a0581 100644 --- a/src/store/pmkid.rs +++ b/src/store/pmkid.rs @@ -229,34 +229,51 @@ impl PmkidStore { self.disk_writer = None; } - /// Rewrites every group key and embedded AP/STA addresses using `canonicalize`. + /// Adds an MLD-keyed copy of every PMKID whose AP/STA canonicalizes to a + /// different address, **keeping the original link-keyed entry**.
/// - /// Groups that collide under the canonical key are merged; per-pair dedup is re-applied - /// so that an identical PMKID value observed under two link addresses is stored once. - /// Callers typically pass an `MldStore::canonicalize` closure. Bit-identical behavior - /// for non-11be captures: when no mapping changes any address, the store is unchanged. + /// Additive, not destructive, for the same reason as + /// [`MessageStore::canonicalize_pairs`](crate::store::messages::MessageStore::canonicalize_pairs): + /// a single-link PMKID is computed under the link MAC, a multi-link one under the + /// MLD MAC, and only one is crackable -- so both are stored. Per-pair dedup is + /// applied so an identical PMKID seen under two link addresses is not duplicated + /// within a group. Bit-identical for non-11be captures (no address changes). + /// Callers typically pass an `MldStore::canonicalize` closure. pub fn canonicalize_pairs(&mut self, mut canonicalize: F) where F: FnMut(MacAddr) -> MacAddr, { if self.disk_mode { - let old_index = std::mem::take(&mut self.disk_index); - for (pair, refs) in old_index { - let canon_ap = canonicalize(pair.ap); - let canon_sta = canonicalize(pair.sta); - let canon_pair = MacPair::new(canon_ap, canon_sta); + let mut additions: Vec<(MacPair, Vec)> = Vec::new(); + for (pair, refs) in &self.disk_index { + let canon_pair = MacPair::new(canonicalize(pair.ap), canonicalize(pair.sta)); + if canon_pair != *pair { + additions.push((canon_pair, refs.clone())); + } + } + for (canon_pair, refs) in additions { self.disk_index.entry(canon_pair).or_default().extend(refs); } return; } - let old = std::mem::take(&mut self.groups); - for (_pair, entries) in old { - for mut entry in entries { - entry.ap = canonicalize(entry.ap); - entry.sta = canonicalize(entry.sta); - self.add(entry); + // Collect MLD-keyed copies, then re-insert them (dedup applies). The + // originals stay in place. 
+ let mut additions: Vec = Vec::new(); + for entries in self.groups.values() { + for entry in entries { + let canon_ap = canonicalize(entry.ap); + let canon_sta = canonicalize(entry.sta); + if canon_ap != entry.ap || canon_sta != entry.sta { + let mut copy = entry.clone(); + copy.ap = canon_ap; + copy.sta = canon_sta; + additions.push(copy); + } } } + for entry in additions { + self.add(entry); + } } /// Returns the total number of unique PMKIDs stored. @@ -425,19 +442,22 @@ mod tests { } #[test] - fn canonicalize_pairs_merges_duplicate_pmkid() { + fn canonicalize_pairs_adds_mld_copy_keeping_link_entries() { let mut store = PmkidStore::new(); store.add(make_entry(0xAA, 0x11, realistic_pmkid(0x99))); store.add(make_entry(0xAA, 0x22, realistic_pmkid(0x99))); assert_eq!(store.total_count(), 2, "before canonicalization: distinct pairs"); + let mld = MacAddr::from_bytes([0x55; 6]); store.canonicalize_pairs(|m| { - if m == MacAddr::from_bytes([0x11; 6]) || m == MacAddr::from_bytes([0x22; 6]) { - MacAddr::from_bytes([0x55; 6]) - } else { - m - } + if m == MacAddr::from_bytes([0x11; 6]) || m == MacAddr::from_bytes([0x22; 6]) { mld } else { m } }); - assert_eq!(store.total_count(), 1, "merged pair must dedupe on PMKID value"); + // Additive: both link entries kept (single-link crackability) plus ONE + // MLD-keyed copy (the two identical PMKIDs dedupe within the MLD group). 
+ assert_eq!(store.total_count(), 3, "two link entries kept + one deduped MLD copy"); + let mld_entries = store.iter().filter(|e| e.sta == mld).count(); + assert_eq!(mld_entries, 1, "MLD pair holds one deduped PMKID"); + let link_entries = store.iter().filter(|e| e.sta != mld).count(); + assert_eq!(link_entries, 2, "both original link-keyed PMKIDs survive"); } #[test] From 8589a2e8f8dc71b55e613124448dc8d4f3058a0c Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:04:07 -0400 Subject: [PATCH 11/14] docs: correct stale claims in hashcat-format references and PR template HASHCAT-CURRENT-FORMATS: the SHA-384 family is suppressed from the legacy sinks, not written with keyver=0. HASHCAT-PROPOSED-CHANGES: drop the retired .taxo example extension. PR template: real FR id example and the current five-crate runtime budget. --- .github/PULL_REQUEST_TEMPLATE.md | 6 +++--- HASHCAT-CURRENT-FORMATS.md | 12 ++++++------ HASHCAT-PROPOSED-CHANGES.md | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 65d10b8..9f830c8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -9,7 +9,7 @@ Fill in each section below. Delete any that are not applicable. ## Related requirement / task - + ## Changes @@ -25,8 +25,8 @@ Fill in each section below. Delete any that are not applicable. ## Dependency changes +policy in CONTRIBUTING.md (`flate2`, `crc32fast`, `clap`, `rayon`, and `sysinfo` +are the entire runtime budget). Delete this section otherwise. 
--> ## Notes for reviewer diff --git a/HASHCAT-CURRENT-FORMATS.md b/HASHCAT-CURRENT-FORMATS.md index 8a899e9..26192fb 100644 --- a/HASHCAT-CURRENT-FORMATS.md +++ b/HASHCAT-CURRENT-FORMATS.md @@ -232,7 +232,7 @@ Hashcat's mode-22000 PMKID parser does not consume this byte (it reads the `***` ## §7 How the 11 wpawolf hash types map onto the four legacy prefixes -`wpawolf` classifies every PSK-crackable hash into one of eleven types (see [`HASHCAT-NEW-FORMATS.md`](HASHCAT-NEW-FORMATS.md) for the full classification). When the legacy sinks (`--22000-out`, `--37100-out`) are configured, each row is rewritten with a legacy prefix; this table shows what comes out and how hashcat handles it. +`wpawolf` classifies every PSK-crackable hash into one of eleven types (see [`HASHCAT-NEW-FORMATS.md`](HASHCAT-NEW-FORMATS.md) for the full classification). When the legacy sinks (`--22000-out`, `--37100-out`) are configured, each SHA-1 / SHA-256-family row is rewritten with a legacy prefix; the SHA-384 family is suppressed from the legacy sinks entirely (`legacy_sink_for` returns no sink) because its 24 B MIC would only produce `Token length exception` parse errors at hashcat startup. This table shows what comes out and how hashcat handles it. | 11-type row | Legacy prefix | Legacy sink | Hashcat reads via | Cracks today? 
| |-------------------------|--------------------|-----------------|-------------------|--------------------| @@ -243,12 +243,12 @@ Hashcat's mode-22000 PMKID parser does not consume this byte (it reads the `***` | PSK-SHA256-EAPOL | `WPA*02*` | `--22000-out` | `keyver=3` | yes (m22000_aux3) | | FT-PSK-PMKID | `WPA*03*` | `--37100-out` | direct prefix | yes (m37100 type=3)| | FT-PSK-EAPOL | `WPA*04*` | `--37100-out` | `keyver=3` | partial (m37100 type=4) -- N2E3 / N4E3 APLESS combos do not crack, see §8.1 | -| PSK-SHA384-PMKID | `WPA*01*` | `--22000-out` | best-effort | **no** -- kernel runs HMAC-SHA1, line is HMAC-SHA384 | -| PSK-SHA384-EAPOL | `WPA*02*` | `--22000-out` | `keyver=0` | **no** -- 24 B MIC truncated to 16 B; module rejects keyver=0 | -| FT-PSK-SHA384-PMKID | `WPA*03*` | `--37100-out` | best-effort | **no** -- kernel runs SHA-256 FT chain, line is SHA-384 | -| FT-PSK-SHA384-EAPOL | `WPA*04*` | `--37100-out` | `keyver=0` | **no** -- 24 B MIC truncated; module rejects keyver=0 | +| PSK-SHA384-PMKID | -- (suppressed) | none -- `--psk-sha384-out` / `-o` only | n/a | **no** -- HMAC-SHA384 PMKID has no kernel; line never written to a legacy sink | +| PSK-SHA384-EAPOL | -- (suppressed) | none -- `--psk-sha384-out` / `-o` only | n/a | **no** -- 24 B MIC does not fit the 16 B field; loader would reject keyver=0 | +| FT-PSK-SHA384-PMKID | -- (suppressed) | none -- `--ft-psk-sha384-out` / `-o` only | n/a | **no** -- needs FT-KDF-SHA-384 chain; line never written to a legacy sink | +| FT-PSK-SHA384-EAPOL | -- (suppressed) | none -- `--ft-psk-sha384-out` / `-o` only | n/a | **no** -- SHA-384 24 B MIC + FT chain, both unsupported | -Six of eleven rows route cleanly through the legacy scheme today. One row misroutes silently inside the kernel (PSK-SHA256-PMKID -- the line is well-formed but the cracker checks the wrong primitive). 
Four rows have no usable legacy path at all -- the SHA-384 family's 24 B MIC does not fit the 16 B MIC field, and the module rejects `keyver=0` even before reaching the kernel. +Six of eleven rows route cleanly through the legacy scheme today. One row misroutes silently inside the kernel (PSK-SHA256-PMKID -- the line is well-formed but the cracker checks the wrong primitive). Four rows have no usable legacy path at all -- the SHA-384 family's 24 B MIC does not fit the 16 B MIC field and the module rejects `keyver=0` even before reaching the kernel, so wpawolf never writes them to the legacy sinks; they surface only on the per-AKM sinks (`--psk-sha384-out`, `--ft-psk-sha384-out`) and the combined `-o` under their `WPA*08*..*11*` prefixes. --- diff --git a/HASHCAT-PROPOSED-CHANGES.md b/HASHCAT-PROPOSED-CHANGES.md index 8115366..f48e033 100644 --- a/HASHCAT-PROPOSED-CHANGES.md +++ b/HASHCAT-PROPOSED-CHANGES.md @@ -20,7 +20,7 @@ Phase 1 is shippable without any hashcat-core patch. Phase 2 unlocks the maximum 1. **One module per input semantic.** Mode 22002 takes a passphrase and runs PBKDF2; mode 22003 takes a 64-hex PMK and skips PBKDF2. The on-disk hash format and the post-PMK math are identical between them. This mirrors the relationship between today's 22000 and 22001. 2. **Type-driven dispatch.** The 2-digit prefix code after `WPA*` is the SOLE routing axis. The loader reads the type, picks the kernel, sets the MIC width, and decides whether to expect FT extras. No `keyver` byte inspection, no AKM inference, no pair-of-fields correlation. 3. **PBKDF2 reuse across all 11 types per ESSID.** A hash file containing every PSK family the operator's capture produced runs PBKDF2 *once per (ESSID, work-item)* and dispatches per-type post-PMK math from the cached PMK in `tmps[].out`. PBKDF2 is the dominant cost (4096 SHA-1 iterations); the per-type math is ~0.1% of that on mode 22002. -4.
**Single-pass cracking of mixed-type files.** The natural input is a `wpawolf -o` per-AKM file containing every hash extracted from one capture. One `hashcat -m 22002 all.taxo wordlist.txt` cracks every variant. No per-type hash-file splitting, no per-mode re-runs. +4. **Single-pass cracking of mixed-type files.** The natural input is a `wpawolf -o` per-AKM file containing every hash extracted from one capture. One `hashcat -m 22002 all.hash wordlist.txt` cracks every variant. No per-type hash-file splitting, no per-mode re-runs. 5. **Greenfield format consumption.** New format only. The new modules never see a legacy line. This eliminates entire categories of loader complexity (the `keyver` peek, the AKM-from-`WPA*01*` guessing problem, the HCCAPX binary path). 6. **Two-phase implementation.** Phase 1 is a self-contained ship target requiring no hashcat-core changes. Phase 2 is an independently reviewable hashcat-core patch plus a kernel-layout refactor. Operators see no CLI or hash-format change between phases. @@ -199,8 +199,8 @@ The branch is a single `switch` at the top of the kernel body. Inside each case Inside one wavefront, GPU lanes that take different `switch` arms run serialised. Realistic mix: -- Operator runs `wpawolf -o all.taxo` and feeds it to `hashcat -m 22002`. -- `all.taxo` carries one `WPA**` line per detected handshake; one capture often has many (PMKID + 3 EAPOL pair combos per session). +- Operator runs `wpawolf -o all.hash` and feeds it to `hashcat -m 22002`. +- `all.hash` carries one `WPA**` line per detected handshake; one capture often has many (PMKID + 3 EAPOL pair combos per session). - Within a single salt (ESSID), every type the capture produced shares the same PBKDF2 output; the host buckets digests by salt before launching aux kernels. - The host launches `m22002_aux2` once per (salt, work-item-batch) with all type-4 + type-5 digests for that salt visible. 
A wavefront iterating digest_pos sees mixed type 4 / type 5 lanes -> ~2x divergence cost on the post-PMK math (cheap relative to PBKDF2 on mode 22002, but visible on mode 22003). @@ -375,8 +375,8 @@ The migration from Phase 1 to Phase 2 is invisible to operators: same mode numbe A test fixture covers all eleven types. For each: ``` -hashcat -m 22002 fixtures/typeNN.taxo wordlist.txt -hashcat -m 22003 fixtures/typeNN.taxo pmk_list.txt +hashcat -m 22002 fixtures/typeNN.hash wordlist.txt +hashcat -m 22003 fixtures/typeNN.hash pmk_list.txt # expected: cracks the fixture password / matches the fixture PMK ``` From c41d676411d6804f272bbb500ad25d4622a8f591 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:05:30 -0400 Subject: [PATCH 12/14] feat: rebuild the stats banner as a checked per-line contract Render the closing summary via Stats::summary_string at a fixed W=60 value column with a 58-char label cap, four-class drop/recover/diagnostic/ informational suffixes, and per-phase sections. Add the counters needed to account for every packet and every drop: unknown link type, post-header truncation, reserved management subtype, malformed mesh control, out-of-band channel, the --eapoltimeout/--rc-drift pair-filter drops, and the two emit-time drops (unclassified AKM, missing FT context). Surface run context and cost: active output filters, per-phase wallclock, throughput, peak RSS, disk-mode, and a one-line zero-hash hint. Enforce packet-accounting identity 1 (total packets == sum of terminal dispositions) with a banner BUG row, a pipeline debug_assert, and an integration test over the generated fixture corpus. Move the full per-line catalogue (label, field, spec source, reason, drop behaviour) into STATS.md and gate it with make audit-stats (tools/audit_stats.sh), wired into check-all; trim ARCHITECTURE.md section 9 to a pointer. 
--- .gitignore | 6 +- ARCHITECTURE.md | 223 ++--- Makefile | 11 +- STATS.md | 256 ++++++ src/extract/beacon.rs | 11 +- src/extract/data.rs | 23 +- src/extract/mgmt.rs | 28 +- src/input/mod.rs | 2 + src/log.rs | 6 +- src/main.rs | 132 ++- src/mem_monitor.rs | 21 +- src/output/mod.rs | 84 +- src/pair/combos.rs | 140 ++- src/pair/mod.rs | 16 +- src/pair/nc_dedup.rs | 7 + src/stats.rs | 873 ++++++++++++++++--- src/store/essid.rs | 11 + tests/integration/generated_corpus.rs | 22 + tests/integration/log_categories_coverage.rs | 6 +- tools/audit_stats.sh | 74 ++ 20 files changed, 1558 insertions(+), 394 deletions(-) create mode 100644 STATS.md create mode 100755 tools/audit_stats.sh diff --git a/.gitignore b/.gitignore index c1a0377..170eaf8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,11 +4,13 @@ docs/ # Top-level tools/ holds local-only scratch space. The wpawolf-fixturegen # crate (a workspace member, see tools/fixturegen/Cargo.toml) is committed. -# tools/audit_citations.sh is the developer-side parity-citation audit -# (skips cleanly when ref/ is missing) and is also committed. +# tools/audit_citations.sh (parity-citation audit; skips cleanly when ref/ is +# missing) and tools/audit_stats.sh (STATS.md vs stats.rs drift gate) are +# also committed. 
tools/* !tools/fixturegen/ !tools/audit_citations.sh +!tools/audit_stats.sh # ============================================================================= # .gitignore -- Rust/Cargo / VS Code / Linux / macOS / Windows diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 5fd3f83..2b32b13 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -60,13 +60,13 @@ The eleven canonical names (used verbatim in stats, source code, and output line **See [`HASHCAT-NEW-FORMATS.md`](HASHCAT-NEW-FORMATS.md) for the deep dive:** the encoding rules, per-type cracker math (PBKDF2 -> PMK -> PMKID / PTK / MIC paths), the differential view between adjacent rows, the shared-subtree map a cracker can cache, the complete hash-line field layout including the 24 B MIC SHA-384 split, the full message-pair byte specification (combo discriminant + APLESS / NC / LE / BE flag bits, plus the separate PMKID-line PMKID_AP / PMKID_CLIENT / PMKID_APPSK256 byte values), and the N#E# vs M#E# notation translation table. -For how the 11 types currently route through hashcat modes 22000 and 37100 (legacy four-prefix scheme, the `keyver` trick, support matrix per row), see [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md). For a sketch of a unified hashcat module (mode 22001) that consumes all 11 types, see [`HASHCAT-PROPOSED-CHANGES.md`](HASHCAT-PROPOSED-CHANGES.md). +For how the 11 types currently route through hashcat modes 22000 and 37100 (legacy four-prefix scheme, the `keyver` trick, support matrix per row), see [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md). For a sketch of two new hashcat modes (22002 passphrase-side, 22003 PMK-side) that consume all 11 types, see [`HASHCAT-PROPOSED-CHANGES.md`](HASHCAT-PROPOSED-CHANGES.md). For how `wpawolf` writes lines and which CLI flags route hashes to which sink, see [`README.md`](README.md). 
### §2.2 Where the deep detail lives -The deep per-type detail (PBKDF2 shared foundation, per-type post-PMK computation, hash-line format with field widths, differential view between adjacent rows, shared-subtree overlap map, and the complete message-pair byte specification including PMKID-line semantics) lives in [`HASHCAT-NEW-FORMATS.md`](HASHCAT-NEW-FORMATS.md). How those 11 types are reached through current hashcat (modes 22000 + 37100, the four legacy prefixes, the `keyver` trick, per-row support matrix) is in [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md). The unified-module sketch (mode 22001) for a future kernel that consumes all 11 types is in [`HASHCAT-PROPOSED-CHANGES.md`](HASHCAT-PROPOSED-CHANGES.md). The operator-facing CLI / output-sink reference lives in [`README.md`](README.md). This document focuses on `wpawolf`'s architecture decisions only. +The deep per-type detail (PBKDF2 shared foundation, per-type post-PMK computation, hash-line format with field widths, differential view between adjacent rows, shared-subtree overlap map, and the complete message-pair byte specification including PMKID-line semantics) lives in [`HASHCAT-NEW-FORMATS.md`](HASHCAT-NEW-FORMATS.md). How those 11 types are reached through current hashcat (modes 22000 + 37100, the four legacy prefixes, the `keyver` trick, per-row support matrix) is in [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md). The new-modules sketch (22002 / 22003) for future kernels that consume all 11 types is in [`HASHCAT-PROPOSED-CHANGES.md`](HASHCAT-PROPOSED-CHANGES.md). The operator-facing CLI / output-sink reference lives in [`README.md`](README.md). This document focuses on `wpawolf`'s architecture decisions only. | Looking for... | Read this | |------------------------------------------------------|----------------------------------------------| @@ -166,7 +166,7 @@ One block / record at a time. I/O buffer 64 KiB (FR-MEM-2). 
EOF mid-record logs `src/link/` strips the radio metadata header; `src/ieee80211/` parses 802.11 frames and tagged parameters. -`src/link/`: `radiotap.rs` (DLT 127, LE, variable `it_len`, multi-word `it_present`); `ppi.rs` (DLT 192, `pph_dlt` must be 105); `prism.rs` (DLT 119, host byte order, AVS-within-Prism detection via BE magic `0x80211xxx`); `avs.rs` (DLT 163, BE per spec - hcxtools treats as LE which is a documented bug we refuse to replicate, with the deviation commented at the parse site per the project's wire-spec convention). +`src/link/`: `radiotap.rs` (DLT 127, LE, variable `it_len`, multi-word `it_present`; non-zero `it_version` is forgiven and counted, not dropped); `ppi.rs` (DLT 192, `pph_dlt` must be 105); `prism.rs` (DLT 119, host byte order, AVS-within-Prism detection via BE magic `0x80211xxx`); `avs.rs` (DLT 163, BE per spec - hcxtools treats as LE which is a documented bug we refuse to replicate, with the deviation commented at the parse site per the project's wire-spec convention); `sll.rs` (DLT 113 SLL / DLT 276 SLL2, Linux cooked capture, ARPHRD 801 raw / 802 Prism / 803 radiotap dispatch); `fcs.rs` (per-packet CRC-32 FCS resolve via the `0x2144DF1C` residue check, five counted outcomes); `recover.rs` (tiered recovery for corrupt link-layer headers: Tier 2 recomputes the radiotap length from `it_present`, Tier 3 scans for the CRC-32 residue). 
`src/ieee80211/`: `frame.rs` (MAC header per `[IEEE 802.11-2024]` §9.2.4.1, address mapping Table 9-60, WDS first-class per §4 invariant 4); `ie.rs` (IE TLV walker for SSID, SSID List tag 84, Mesh ID tag 114, Country, vendor AP names, OWE Transition Mode, CCX1, WPS); `rsn.rs` (RSN IE tag 48: version, group cipher, pairwise list, AKM list, RSN caps, PMKID list, group management cipher per §9.4.2.24); `ft.rs` (MDE tag 54 for MDID, FTE tag 55 subelement 3 for R0KH-ID 1-48 B, subelement 1 for R1KH-ID 6 B per §9.4.2.45, §9.4.2.46); `eapol.rs` (EAPOL-Key per §12.7.2, M1/M2/M3/M4 from Key Information bits per Table 12-10, KDV validation per Table 12-11); `eap.rs` (EAP per RFC 3748 §4 - identity Type 1 and inner-method username); `amsdu.rs` (A-MSDU subframe iteration per §9.3.2.2.2); `anqp.rs` (ANQP element parsing for venue / domain / NAI realm extraction). @@ -189,7 +189,7 @@ match frame.subtype { Action { cat: 15, .. } => extract_mesh_peering_pmkid(frame), // S18/S19 Data EAPOL-Key M1..M4 => extract_eapol_msg_and_pmkid(frame), // S1/S2 Data EAP => extract_eap_identity_username(frame), - _ => stats.skipped += 1, + _ => {}, // per-subtype counter only, no extraction } ``` @@ -279,7 +279,7 @@ These are the non-negotiable rules of the codebase. Violating any of them is a r ### 2. Collect-then-pair (no stream pairing, no eviction) -All EAPOL messages for an `(AP, STA)` pair go into `HashMap>` first. Pairing runs in §3.4 on the complete per-group message set. There is no ring buffer, no eviction, and no per-type message cap. If RSS exceeds 80 % of system RAM during Phase 1 ingestion or Phase 4 pairing, the process aborts with a clear "approaching OOM" message. Use `--per-file` to bound memory on large corpora. +All EAPOL messages for an `(AP, STA)` pair go into `HashMap>` first. Pairing runs in §3.4 on the complete per-group message set. There is no ring buffer, no eviction, and no per-type message cap. 
If RSS reaches 80 % of system RAM (override via `WPAWOLF_MEM_THRESHOLD`) during Phase 1 ingestion or Phase 4 emission, `MemMonitor` sets a sticky disk-mode flag: `MessageStore` and `PmkidStore` spill to temp-file storage (`src/store/disk_messages.rs`) and Phase 4 streams groups back one at a time, while hash-line dedup falls back to partitioned fingerprint bucket files with a post-run cleaning pass (`src/output/disk_dedup.rs`). The run degrades to disk speed instead of aborting, and collect-then-pair semantics are preserved in both modes. This is the single most important architectural difference vs upstream hcxpcapngtool. Their implementation pairs on arrival using a 64-entry shared ring (`MESSAGELIST_MAX = 64`); when the 65th message arrives without a successful pair, the oldest is silently dropped. wpawolf cannot miss a valid pair regardless of message ordering or interleaving from other AP/STA pairs because pairing never runs on a partial set. @@ -293,7 +293,7 @@ Upstream drops EAPOL frames > 255 B via `EAPOL_AUTHLEN_OLD_MAX`. wpawolf emits e Upstream skips WDS frames (To DS = 1, From DS = 1) unless `--all` is passed. wpawolf always processes them. The frame parser handles all four To-DS / From-DS combinations per `[IEEE 802.11-2024]` Table 9-60. Relay frames carry valid handshakes between repeaters and upstream APs; skipping them means missing hashes. -There is no flag to opt out. WDS frames are counted in `stats.wds_count` for observability. +There is no flag to opt out. WDS frames are counted in `stats.relay_frames` for observability. ### 5. Global SipHash dedup @@ -387,11 +387,11 @@ Error policy: ### 11. Minimal dependency budget -Direct runtime dependencies: 4 crates -- `flate2` (gzip, `rust_backend`-only feature), `clap` (CLI, derive), `rayon` (parallel Phase 4 pairing via work-stealing), and `sysinfo` (cross-platform RSS + total-RAM queries for OOM detection and `--debug` memory reporting). New deps require a paragraph-long justification in the PR body.
Rejected crates: `pcap-file` / `pcap-parser` (8-10 transitive deps for ~500 lines we write); `ieee80211` (9 mandatory deps for a fraction of its features); `serde`, `regex`, `tokio`, `anyhow`, `thiserror`, `hex`, `nom` (each replaceable inline or out of scope). Future cryptographic primitives may use RustCrypto (`sha1`, `md-5`, `aes`, `cmac`, `hmac`). `cargo deny` (`deny.toml`) gates the supply chain: OSI-permissive licenses only, no unknown registries, no git deps. +Direct runtime dependencies: 5 crates -- `flate2` (gzip, `rust_backend`-only feature), `crc32fast` (per-packet FCS validation and Tier 3 recovery; already a transitive dep via `flate2`, promoted to direct for its SIMD path), `clap` (CLI, derive), `rayon` (parallel Phase 4 pairing via work-stealing), and `sysinfo` (cross-platform RSS + total-RAM queries for memory-pressure detection and `--debug` memory reporting). New deps require a paragraph-long justification in the PR body. Rejected crates: `pcap-file` / `pcap-parser` (8-10 transitive deps for ~500 lines we write); `ieee80211` (9 mandatory deps for a fraction of its features); `serde`, `regex`, `tokio`, `anyhow`, `thiserror`, `hex`, `nom` (each replaceable inline or out of scope). Future cryptographic primitives may use RustCrypto (`sha1`, `md-5`, `aes`, `cmac`, `hmac`). `cargo deny` (`deny.toml`) gates the supply chain: OSI-permissive licenses only, no unknown registries, no git deps. ### Memory budget (informational) -No artificial ceiling. Two compaction passes shape the runtime footprint: `EssidMap` SSID bodies are interned through an `Arc<[u8]>` set so identical SSID broadcasts across APs share one heap allocation, and `MessageStore::add` dedups byte-identical EAPOL frames at insert by `(msg_type, akm, eapol_frame)` so retransmitted M2 / M4 frames collapse before pair generation runs. The runtime is dominated by `MessageStore`, then `EssidMap`, `AkmMap`, `PmkidStore`, `EssidSet`, and the global `DedupSet` (~one `u64` per emitted line). 
Empirically the footprint scales roughly linearly with input size at well under one GiB per few GiB of mixed-vendor capture data after the compaction passes. wpawolf does not introspect its own memory footprint; operators run `/usr/bin/time -v` or `perf stat` for an authoritative number, or pass `--mem-stats` to print a per-store table at the end of the run. +No artificial ceiling. Two compaction passes shape the runtime footprint: `EssidMap` SSID bodies are interned through an `Arc<[u8]>` set so identical SSID broadcasts across APs share one heap allocation, and `MessageStore::add` dedups byte-identical EAPOL frames at insert by `(msg_type, akm, eapol_frame)` so retransmitted M2 / M4 frames collapse before pair generation runs. The runtime is dominated by `MessageStore`, then `EssidMap`, `AkmMap`, `PmkidStore`, `EssidSet`, and the global `DedupSet` (~one `u64` per emitted line). Empirically the footprint scales roughly linearly with input size at well under one GiB per few GiB of mixed-vendor capture data after the compaction passes. `MemMonitor` (`src/mem_monitor.rs`) probes process RSS against total system RAM throughout the run and flips the stores into the disk-backed fallback at the 80 % threshold (invariant 2); progress lines report RSS, and `--mem-stats` prints a per-store byte-count table at the end of the run. For an external authoritative number, operators run `/usr/bin/time -v` or `perf stat`. --- @@ -658,7 +658,7 @@ Typical size: ~110 B stack + ~140 B shared-heap = ~250 B per message (amortised ### §5.12 Relay (WDS / 4-address) frames -Standard 802.11 frames use three MAC address fields. Mesh and WDS frames use four (To DS = 1, From DS = 1). Both the address interpretation and the BSSID detection change per `[IEEE 802.11-2024]` Table 9-60. wpawolf parses these frames identically and pairs handshakes carried within them without any flag (§4 invariant 4). They are counted in `stats.wds_count`. +Standard 802.11 frames use three MAC address fields. 
Mesh and WDS frames use four (To DS = 1, From DS = 1). Both the address interpretation and the BSSID detection change per `[IEEE 802.11-2024]` Table 9-60. wpawolf parses these frames identically and pairs handshakes carried within them without any flag (§4 invariant 4). They are counted in `stats.relay_frames`. WDS classification runs in **Phase 1.5** (`src/extract/wds.rs`) after the `essid_map` is fully populated, then walks every deferred frame through a three-tier resolution ladder. Tier 3 always succeeds for syntactically valid EAPOL frames, so resolution does **not** depend on `essid_map` being populated -- a capture with no Beacon / Probe Response still recovers every valid WDS handshake. @@ -825,7 +825,7 @@ Per-location notes follow. Each maps to a `PmkidSource` enum variant. The summar All field names verified against the local Wireshark 4.x field registry (`tshark -G fields`). The S1 KDE filter uses `wlan.rsn.ie.pmkid` (KDE type 4 PMKID dissection); S3-S17 use `wlan.rsn.pmkid.count` (RSN IE PMKID List); S20 uses the dedicated `wlan.osen.pmkid.count` (OSEN IE PMKID dissection). -### §6.6 AKMs that wpawolf parses but does not emit +### §6.6.1 AKMs that wpawolf parses but does not emit Three AKM families produce PMKIDs that wpawolf walks through the extraction path (and counts in stats) but does not turn into a hashcat line: @@ -989,6 +989,15 @@ Handle raw IEEE 802.11 (DLT 105). No link-layer header. Frame starts at offset 0 #### FR-LL-6 Reject unsupported link types with a warning. Do not abort the file - skip packets from that interface. +#### FR-LL-7 +Parse Linux cooked capture headers (DLT 113 SLL, 16-byte header; DLT 276 SLL2, 20-byte header). Fields big-endian per libpcap. 
The ARPHRD hardware type (`sll_hatype` at offset 2 for SLL, `sll2_hatype` at offset 8 for SLL2) selects the inner payload format: 801 (`ARPHRD_IEEE80211`) -> raw 802.11 at the end of the cooked header; 802 (`ARPHRD_IEEE80211_PRISM`) -> Prism header per FR-LL-3 (including AVS-within-Prism detection); 803 (`ARPHRD_IEEE80211_RADIOTAP`) -> radiotap header per FR-LL-1. Any other ARPHRD value carries no 802.11 frame and is skipped. PPI and standalone AVS have no ARPHRD value and cannot appear inside SLL. [libpcap `pcap-linktype(7)` LINKTYPE_LINUX_SLL / LINKTYPE_LINUX_SLL2; Linux `if_arp.h` ARPHRD constants] + +#### FR-LL-8 +Validate FCS presence by CRC-32 on every frame, every DLT. The 802.11 FCS is standard CRC-32 (ISO 3309 / IEEE 802.3); `crc32(data || fcs)` always equals the residue constant `0x2144DF1C`, so one pass over the stripped payload proves whether a trailing FCS is present. `link::fcs::resolve` combines the CRC verdict with the link-layer header's FCS flag (radiotap Flags bit `0x10`) and the BADFCS flag (radiotap Flags bit `0x40`, `IEEE80211_RADIOTAP_F_BADFCS`) into five outcomes, each with its own §9.2 counter: header and CRC agree (strip), CRC detected an unannounced FCS (strip -- the header was wrong), BADFCS flagged (strip; frame was received corrupt on the air), CRC mismatch with no flag (strip; trust the header), neither (no strip). This replaces the earlier radiotap-flag-only tail-strip: frames whose header never announced an FCS previously fed 4 trailing checksum bytes to the IE walker as body data. + +#### FR-LL-9 +Attempt tiered recovery before dropping a frame whose link-layer strip failed (`src/link/recover.rs`). Tier 2 (`recovered_tier2`): when radiotap `it_len` is corrupt, recompute the expected header length from the `it_present` bitmask field sizes and natural alignment; bail to Tier 3 when the variable-size TLV (bit 28) or vendor-namespace (bit 30) bits are set. 
Tier 3 (`recovered_tier3`): scan byte offsets 0-144 (the largest known link-layer header, Prism) for the CRC-32 residue match that proves where the 802.11 frame plus FCS starts; minimum 14-byte slice (10-byte minimal control frame + 4-byte FCS) to reject null/FF-padding false positives. Only frames failing all tiers increment `link_errors` and route to the `plcp_error` log category. + ### §8.3 802.11 frame parsing - FR-80211-* #### FR-80211-1 @@ -1051,7 +1060,7 @@ Data frames (type 2): - EAPOL header (4 bytes): Protocol Version (1), Packet Type (1), Body Length (2 BE). Packet Type `3` = EAPOL-Key, Packet Type `0` = EAP-Packet. - Parse EAPOL-Key body per `[IEEE 802.11-2024]` §12.7.2. - Parse EAP frames (Packet Type 0) per RFC 3748. -- Protected Frame bit (FC bit 14): set -> MPDU encrypted. Initial M1/M2/M3/M4 are clear regardless of PMF (§12.7.6 / §12.7.9). Encrypted management frames are logged in `stats.encrypted_mgmt_count` and skipped. +- Protected Frame bit (FC bit 14): set -> MPDU encrypted. Initial M1/M2/M3/M4 are clear regardless of PMF (§12.7.6 / §12.7.9). Encrypted management frames are counted in `stats.mgmt_protected_frames` (protected Action frames additionally in `stats.mgmt_protected_action_skipped`) and skipped. #### FR-80211-4 EAPOL-Key frame Key Information field (2 B, offset 5 from EAPOL body start, BE) per §12.7.2 Figure 12-36: @@ -1093,8 +1102,8 @@ EAPOL frame validation: - KDV=1 iff AKM 1 or 2 with TKIP pairwise (legacy) - KDV=2 iff AKM 1 or 2 with non-TKIP RSNA pairwise - KDV=3 iff AKM 3, 4, 5, or 6 (mandatorily) - - KDV=0 otherwise - - Other values increment `stats.bad_kdv_count` and the frame is skipped. 
+ - KDV=0 otherwise (the "AKM-defined" value used by the SHA-384 families; parses normally and is counted in `stats.eapol_kdv0`) + - Reserved values 4-7 are counted in `stats.eapol_kdv_other` at decode; the EAPOL-Key parser then rejects the frame with the `bad_kdv` reason (surfaced via `stats.eapol_llc_invalid` and the `[eapol_key_rejected]` log category). - Reject frames with all-ones (0xFFFFFFFF) in MIC or nonce. ### §8.4 PMKID extraction - FR-PMKID-* @@ -1114,7 +1123,7 @@ PMKID Count (0 or 2) | PMKID List (16*s) | Group Management Cipher Suite (0 or 4) ``` -All fields after Version are optional - if one is absent, all subsequent are absent. Extract AKM suite selector (OUI `00:0F:AC` + type byte) per Table 9-190; cipher suites per §9.4.2.24.2 Table 9-188; RSN Capabilities per §9.4.2.24.4 Figure 9-374. wpawolf logs the raw RSN Capabilities hex into `stats.rsn_caps_histogram` and uses bits B6/B7 (MFPR/MFPC) to annotate each `(AP, STA)` pair with PMF state. +All fields after Version are optional - if one is absent, all subsequent are absent. Extract AKM suite selector (OUI `00:0F:AC` + type byte) per Table 9-190. The cipher-suite list (§9.4.2.24.2 Table 9-188) and RSN Capabilities field (§9.4.2.24.4 Figure 9-374) are length-walked but not tallied -- neither drives hash emission, and PMF presence is already observable via the frame-level `stats.mgmt_protected_frames` counter, so no per-suite or capabilities histogram is kept. #### FR-PMKID-3 Extract PMKID from Association/Reassociation Request RSN IE. Parse RSN IE from tagged parameters, extract PMKID list. For FT-PSK: also parse MDE (Element ID 54 per §9.4.2.45) for MDID (2 B) and FTE (Element ID 55 per §9.4.2.46) for R0KH-ID (subelement type 3, 1-48 B) and R1KH-ID (subelement type 1, 6 B). 
@@ -1297,8 +1306,7 @@ Output-filter and runtime flags (unfiltered defaults): | `--dedup-hash-combos` | false | 6 combos -> 3 unique per session | | `--nc-dedup` | false | cluster near-identical nonces, keep one survivor with FLAG_NC (§5.8.1) | | `--nc-tolerance` *n* | 8 | cluster span tolerance for `--nc-dedup`; ignored unless `--nc-dedup` set | -| `--strict` | false | bundle: `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --per-file --nc-dedup` | -| `--per-file` | false | pair + emit + clear MessageStore/PmkidStore per input file | +| `--strict` | false | bundle: `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --nc-dedup` | | `--threads` *n* | CPU count | Phase 4 worker thread count | | `--essid-collapse-min` *n* | 3 | multi-SSID collapse guard: minimum distinct SSIDs before collapse fires | | `--essid-collapse-ratio` *n* | 10 | multi-SSID collapse guard: top-count / second-count ratio threshold | @@ -1309,21 +1317,27 @@ Output-filter and runtime flags (unfiltered defaults): #### FR-CLI-4 Info flags: `-h` / `--help`, `-v` / `--version` provided by `clap`. The summary statistics are printed unconditionally to stdout on every run. stderr produces no output. -`--log` categories (lowercase tags, written by `src/log.rs`): +`--log` is a **triage tool** (`src/log.rs`): it records events where wpawolf dropped, skipped, or rejected data for non-obvious reasons. Obvious high-volume events (null-kind nonce / MIC / PMKID rejections, out-of-sequence timestamps, FCS outcomes, recovery tiers) are stats-banner-only and produce no log lines. 
-- `malformed_frame` - truncated or structurally invalid 802.11 / EAPOL data -- `plcp_error` - link-layer header validation failed (radiotap / PPI / Prism / AVS error, or an unsupported DLT) +Per-event categories (one line per event, written immediately; every per-frame line carries `file=` and `frame=` context): + +- `eapol_key_rejected` - EAPOL-Key frame passed the LLC/packet-type gate (EtherType `0x888E`/`0x88C7`, packet type = 3) but failed the EAPOL-Key parser for a structural reason other than a garbage nonce or MIC (those are captured by `[invalid_nonce]` / `[invalid_mic]`). Carries `ap=`, `sta=`, `reason=` (one of `bad_llc_header`, `bad_ethertype`, `truncated_short`, `bad_descriptor_type`, `bad_kdv`, `truncated_24mic`, `classify_flags_invalid`), and `bytes=` (first 32 raw bytes in lowercase hex for Wireshark cross-reference). Only genuinely structural failures appear here; spec-correct M4 null-nonce drops are stats-only (`null_nonce_rejected` on-M4 sibling row) +- `invalid_nonce` - EAPOL frame discarded: nonce matched a non-obvious garbage pattern (`ff` / `repeat_1` / `repeat_2` / `repeat_4`; `null`-kind rejections are stats-only and suppressed from the log). Line carries `ap= sta= msg_type= kind= nonce_hex=<32 B hex>` so downstream tooling can filter by pattern and grep the source capture for the rejected bytes +- `invalid_mic` - EAPOL frame discarded: MIC matched a non-obvious garbage pattern with the Key MIC flag set (M2/M3/M4; `null` suppressed as above). Line carries `ap= sta= msg_type= kind= mic_hex=<16/24 B hex>` (16 for the SHA-1/SHA-256 families, 24 for SHA-384) +- `invalid_pmkid` - PMKID discarded: matched a non-obvious garbage pattern (`null` suppressed as above). 
Line carries `ap= sta= kind= pmkid_hex=<16 B hex>` - `unknown_linktype` - pcapng EPB referenced an `interface_id` for which no preceding IDB exists; the packet is dropped -- `unknown_akm` - AKM suite type outside [IEEE 802.11-2024] Table 9-190 -- `essid_not_found_summary` - per-AP summary: the AP's SSID was never observed, so every would-have-been-emitted hash line for it was dropped at output time as uncrackable. One line per affected AP at end of run; carries `ap=`, `dropped=N`, `first_seen_us=`, `last_seen_us=` so the operator can locate the source frames in the original capture - `capture_read_error` - per-file ingest error (typically a truncated trailing packet record per FR-IN-10); the file is closed and the run continues -- `invalid_nonce` - EAPOL frame discarded: nonce matched a garbage pattern (`null` / `ff` / `repeat_1` / `repeat_2` / `repeat_4` on any message type, M4 included). M4 NULL nonce is spec-valid on the wire per §12.7.6.5 NOTE 9 but is dropped because the hash line is cryptographically dead; see §5.10. Line carries `kind= nonce_hex=<32 B hex>` so downstream tooling can filter by pattern and an operator can grep the source capture for the rejected bytes -- `invalid_mic` - EAPOL frame discarded: MIC matched a garbage pattern (`null`, `ff`, `repeat_1`, `repeat_2`, `repeat_4`) with the Key MIC flag set (M2/M3/M4). Line carries `kind= mic_hex=<16/24 B hex>` (16 for AKMs 1-6, 8, 9, 11; 24 for the SHA-384 family) -- `invalid_pmkid` - PMKID discarded: matched a garbage pattern (`null`, `ff`, `repeat_1`, `repeat_2`, `repeat_4`). Line carries `kind= pmkid_hex=<16 B hex>` -- `eapol_key_rejected` - EAPOL-Key frame passed the LLC/packet-type gate (EtherType `0x888E`/`0x88C7`, packet type = 3) but failed the EAPOL-Key parser for a structural reason other than a garbage nonce or MIC (those are already captured by `[invalid_nonce]` / `[invalid_mic]`). 
Carries `timestamp_us`, `ap=`, `sta=`, `reason=` (one of `truncated_short`, `bad_descriptor_type`, `bad_kdv`, `truncated_24mic`, `classify_flags_invalid`), and `bytes=` (first 32 raw bytes in lowercase colon-hex for Wireshark cross-reference). Only the ~10 genuinely structural failures per multi-GB corpus appear here; the ~15 600 spec-correct M4 null-nonce drops are already fully described by `[invalid_nonce] kind=null msg_type=m4` -- `essid_control_bytes` - SSID informational notice, **not a discard and not a sign wpawolf altered the SSID**: the SSID byte run contained at least one byte in `0x00..=0x1F` (the full ASCII C0 control range, NUL through US -- every control character). Per [IEEE 802.11-2024] §9.4.2.2 the SSID element is "an arbitrary sequence of 0-32 octets" with no printable-character requirement, so a control-byte SSID is valid on the wire; wpawolf is required to handle it and ships the byte run to hashcat unchanged. The line carries `essid_hex=` in lowercase hex so the operator triaging a capture can locate the source frame. SSIDs that fail the spec-driven length / first-byte-zero gate are discarded silently by upstream counters and are NOT logged +- `skipped_input` - input file could not be classified by magic bytes (sub-4-byte stubs, non-capture files passed explicitly); counted in `files_skipped_unknown_format` +- `essid_not_found_summary` - per-AP summary: the AP's SSID was never observed, so every would-have-been-emitted hash line for it was dropped at output time as uncrackable. One line per affected AP at end of run; carries `ap=`, `dropped=N`, `first_seen_us=`, `last_seen_us=` so the operator can locate the source frames in the original capture -Format: `[category] `. Per-category field layout matches the `Logger::log_*` method signatures. 
Frame-bearing categories (`malformed_frame`, `plcp_error`, `invalid_nonce`, `invalid_mic`, `invalid_pmkid`, `essid_control_bytes`) lead with `timestamp_us`; `unknown_linktype`, `unknown_akm`, `essid_not_found_summary`, and `capture_read_error` do not (the event has no single packet timestamp; the summary line carries its own `first_seen_us` / `last_seen_us` range fields). +Aggregated categories (accumulated during the run, flushed as per-reason summary lines with counts -- a noisy capture cannot flood the log): + +- `plcp_error` - link-layer strip failed after all FR-LL-9 recovery tiers were exhausted; one summary line per (reason, DLT) pair +- `malformed_frame` - 802.11 MAC header truncated or structurally invalid; one summary line per reason +- `unknown_akm` - AKM suite type outside [IEEE 802.11-2024] Table 9-190; one summary line per AKM byte +- `essid_control_bytes` - SSID informational notice, **not a discard and not a sign wpawolf altered the SSID**: the SSID byte run contained at least one byte in `0x00..=0x1F` (the full ASCII C0 control range). Per [IEEE 802.11-2024] §9.4.2.2 the SSID element is "an arbitrary sequence of 0-32 octets" with no printable-character requirement, so a control-byte SSID is valid on the wire; wpawolf ships the byte run to hashcat unchanged. Single summary line with a total count; SSIDs that fail the spec-driven length / first-byte-zero gate are discarded silently by upstream counters and are NOT logged + +Format: `[category] key=value key=value ...`. Per-category field layout matches the `Logger::log_*` method signatures. MAC addresses are bare lowercase hex (12 chars, no separators); hex byte fields are contiguous lowercase hex; non-integer values are double-quoted, integer and hex-only values stay bare. There is no `ts=` field -- `file=` / `frame=` locate the source packet exactly. 
### §8.9 Correctness, performance, dependencies, build, threading @@ -1358,7 +1372,7 @@ Estimated memory for typical captures - see §4 memory budget table. wpawolf does not introspect its own memory footprint; process memory is measured externally via `/usr/bin/time -v` or `perf stat`. #### FR-DEP-1 -Direct dependency count: 4 crates (`flate2` + `clap` + `rayon` + `sysinfo`). `flate2` pulls `miniz_oxide`; `clap` pulls its derive/builder proc-macro ecosystem; `rayon` pulls `crossbeam-deque`/`crossbeam-epoch`; `sysinfo` is self-contained on Linux (adds `core-foundation-sys` on macOS). +Direct dependency count: 5 crates (`flate2` + `crc32fast` + `clap` + `rayon` + `sysinfo`). `flate2` pulls `miniz_oxide` and already pulled `crc32fast` transitively (promoting it to direct adds zero new supply-chain surface); `clap` pulls its derive/builder proc-macro ecosystem; `rayon` pulls `crossbeam-deque`/`crossbeam-epoch`; `sysinfo` is self-contained on Linux (adds `core-foundation-sys` on macOS). #### FR-DEP-2 No large async / serialisation frameworks. All hex encoding, pcap / pcapng / 802.11 / EAPOL parsing is implemented inline. @@ -1391,145 +1405,9 @@ Phase 1 (parsing) is I/O-bound and sequential for a single file. Multi- file ing ## §9 Stats catalogue -The closing summary is hcxpcapngtool-shaped: anyone who has read `hcxpcapngtool` output should be able to read wpawolf output without a glossary. We match hcxpcapngtool's line set as the floor and add more where the upstream tool is missing data. The summary is reorganised into five banner sections (one per pipeline phase) so an operator can immediately see which phase a parse failure occurred in. - -### §9.1 Phase 1 (Ingest) counters - -- `input_file_count` -- regular files actually opened by the ingest loop. 
Single-file runs render the original `file name / file format / endian / network type` quartet for hcxpcapngtool parity; multi-file runs (when positional args expand to more than one capture, typically from a recursive directory walk) instead surface a histogram-style banner: `input files processed`, `file formats seen` (e.g. `pcap 2.4 (12), pcapng 1.0 (3)`), `endians seen`, `network types seen`, `last file processed`. -- `file_formats_seen` / `endians_seen` / `dlt_descs_seen` -- `BTreeMap` histograms populated once per file from the reader's `FileMetadata`. Sorted by descending count (then key) at display time so an operator can spot a single odd file in a large multi-capture run. -- Truncated-trailing-record count (`truncated_capture_files`, `unreadable_packets`); MAC-header-malformed count (`malformed_mac_hdr`); link/parse error count (`link_errors`); forgiven non-zero Protocol Version frames (`lenient_proto_version`). -- FCS framing count (radiotap Flags bit 0x10). -- Multi-member gzip stream count. - -Every "issue" stat is suffixed with whether the count means data was **dropped** (frames or hashes lost), **recovered** (the issue was worked around and the data was processed), or **diagnostic** (the issue was noted but had no data impact). For example `link/parse errors (frames dropped)`, `frames with non-zero Protocol Version (forgiven; processed)`, `capture files with truncated trailing record (earlier records kept)`. This convention applies across every phase. - -### §9.2 Phase 2 (Decode) counters - -- Per-DLT packet counts (105, 119, 127, 163, 192) - parser-health signal for radiotap / PPI / Prism mismatch. -- Radiotap vendor-namespace blocks skipped; AVS-within-Prism frames detected. -- Per-band packet counts (2.4 / 5 / 6 GHz) from radiotap Channel field; beacon channel distribution from DS Parameter Set IE (tag 3). 
-- WDS / 4-address frame count (`stats.wds_count`); frame-type histogram (mgmt / ctrl / data); encrypted management frame count (`stats.encrypted_mgmt_count`); malformed MAC header counter. -- A-MSDU aggregation: `stats.amsdu_frames_seen` (Data frames with A-MSDU bit set) and `stats.amsdu_subframes_total` (subframes parsed for hidden EAPOL). -- Radiotap FCS: `stats.fcs_stripped_frames` (frames whose trailing 4-byte FCS was tail-stripped because radiotap Flags bit `0x10` signalled `IEEE80211_RADIOTAP_F_FCS`). -- EAPOL Key Descriptor Version histogram: `stats.eapol_kdv1` (HMAC-MD5 / WPA legacy), `eapol_kdv2` (HMAC-SHA1 / WPA2-PSK family), `eapol_kdv3` (AES-CMAC / PSK-SHA256 family), `eapol_kdv_other` (KDV=0 or reserved). Plus `stats.eapol_rsn` vs `stats.eapol_wpa` for the descriptor type byte (0x02 RSN vs 0xFE WPA legacy). Drives the KDV-first AKM reconciliation in `store_eapol_key`. - -### §9.3 Phase 3 (Extract) counters - -**Management subtype counts.** - -- Beacons, Probe Requests (directed / undirected), Probe Responses. -- Association Requests, Reassociation Requests with per-AKM breakdown (PSK / FT-PSK / PSK-SHA256 / SAE / OWE). -- Authentication frames per algorithm (Open System / Shared / SAE / FBT / FILS / PASN / Network-EAP / unknown). -- Action frames: total + containing ESSID. AWDL. -- Deauthentication, Disassociation. Reason Code histogram per `[IEEE 802.11-2024]` §9.4.1.7 Table 9-90 with these promoted to their own counters: - - Reason 14 "Message integrity code (MIC) failure" -> `stats.mic_failure_deauths` (canonical "this handshake will never pair cleanly" signal). - - SAE status 77 "Authentication is rejected because the offered finite cyclic group is not supported" -> `stats.sae_group_rejected` (WPA3 equivalent of the FT failure signals). -- Authentication response Status Code per §9.4.1.9 Table 9-92, with these promoted: - - 52 "R0KH unreachable" -> own counter. - - 53 "Invalid PMKID" -> own counter. - - 54 "Invalid MDE" -> own counter. 
- - 55 "Invalid FTE" -> own counter. - -**ESSID counters.** - -- Total unique, SSID wildcard / unset, zeroed SSID, oversized SSID, ESSID changes per AP. -- `essid_unresolved_emissions` / `essid_unresolved_aps` -- hash lines dropped at output time because no ESSID was ever observed for the AP (uncrackable per FR-ESSID-3), and the count of distinct AP MACs contributing those drops. Each affected AP also produces one `[essid_not_found_summary]` line in `--log` carrying `dropped=N`, `first_seen_us=`, `last_seen_us=`. - -**Multi-SSID inflation -- why this exists.** - -Hash extraction is a per-(AP, SSID) cartesian product: every recorded SSID for an AP produces its own hash line because the PMK derivation binds PSK + SSID. In a clean capture this is the right behaviour -- dual-band ("Home-2g" / "Home-5g") and 3-SSID enterprise rollouts ship 2-3 lines per AP and a downstream tool cracks whichever applies. - -In a corpus with RF-rotted captures, one physical AP can produce 4-30+ "distinct" SSIDs that are all bit-flipped variants of one real broadcast. The fanout inflates one crackable handshake into N uncrackable lines plus one crackable line, polluting the queue. The per-AP fanout is also load-bearing on the scan-line yield: a thousand corrupted APs with mean fanout 6 add ~5000 line-equivalents that nobody can solve. - -`--essid-collapse-min` / `--essid-collapse-ratio` collapse the inflation when both axes agree. The collapse minimum (default 3) keeps genuine multi-SSID setups untouched; the collapse ratio (default 10) keeps APs with no clear primary SSID untouched (e.g. a CTF AP cycling through 11 named SSIDs with similar counts). Both must trip for the collapse to fire, so a singleton corruption (`SSID-A x4192`, `SSID-B x3`) drops the corruption while a 4-SSID load-balanced rollout (counts 100/95/90/85) ships every SSID. 
Defaults are tuned against a representative multi-SSID sample drawn from real-world captures: most multi-SSID hash-producing APs broadcast 2 SSIDs (genuine dual-band), a smaller fraction broadcast 3 (segmented rollouts), and outliers exhibit clear RF-rot patterns. See README "When one AP shows up under many SSIDs" for a worked example. - -**EAPOL counters.** - -- M1 / M2 / M3 / M4 totals, oversized, FT-using-PSK. -- Max EAPOL authentication length seen per message type. -- Replay-counter gap histogram, EAPOLTIME gap (max ms). -- ANonce error corrections, M4 zeroed-nonce count, M1 4E4 authorized variants. -- Rejection counters: `null_nonce_rejected`, `ff_nonce_rejected`, `repeat_nonce_rejected`, `null_mic_rejected`, `ff_mic_rejected`, `repeat_mic_rejected`, `null_pmkid_rejected`, `ff_pmkid_rejected`, `repeat_pmkid_rejected`, `bad_kdv_count`. -- Informational counter: `essid_control_bytes_warned` (SSIDs that survived the spec gate but contained at least one byte in `0x00..=0x1F`; not a rejection, not a transformation -- the SSID byte run is shipped to hashcat unchanged). -- WDS direction tier breakdown: `eapol_tier1_direction`, `eapol_tier1b_essid`, `eapol_tier2_ack_discovery`, `eapol_tier3_flag_fallback`, `eapol_ack_mismatches`. -- `eapol_preauth_frames` -- LLC/SNAP `EtherType` `0x88C7` frames per [IEEE 802.11-2024] §12.3.2; counted alongside standard `0x888E` so inter-AP preauth traffic is visible. -- `eapol_llc_invalid` -- frames where the LLC/SNAP `EtherType` was `0x888E` / `0x88C7` AND the EAPOL Packet Type byte was 3 (EAPOL-Key) but the EAPOL-Key parser bailed (truncated body, bad descriptor, sentinel-rejected MIC/nonce). EAP-Packet (type 0), EAPOL-Start (1), and EAPOL-Logoff (2) are legitimate non-key frames and do **not** increment this counter. -- `mesh_control_frames` -- mesh BSS Data frames whose Mesh Control header was successfully skipped per §9.2.4.8.3, recovering an inner MSDU for downstream EAPOL/EAP processing. 
-- `eap_success_frames` / `eap_failure_frames` -- terminal EAP outcome codes (RFC 3748 §4.2). Stats-only; carry no identity data and never affect hash extraction. Drives capture-quality triage for mixed PSK / Enterprise traffic. - -**Plaintext extraction surfaces.** Every IE / vendor-IE / action-frame field that yields wordlist-grade plaintext is parsed and counted. The catalogue below is the contract -- a regression that drops one of these surfaces is a `tests/integration/extraction_coverage.rs` failure. - -| Surface | Spec / source | Counter | Sink | -|---|---|---|---| -| SSID (tag 0) | §9.4.2.2 | (always) | `essid_set`, `essid_map`, wordlist | -| SSID List (tag 84) | §9.4.2.71 | `ssid_list_entries` | `essid_set`, wordlist | -| Mesh ID (tag 114) | §9.4.2.97 | `mesh_ids_extracted` | `essid_map`, `essid_set`, wordlist | -| Country (tag 7) | §9.4.2.9 | `country_codes_extracted` | wordlist | -| Time Zone (tag 98) | §9.4.2.85 | `time_zones_extracted` | wordlist | -| WPS device info (vendor IE OUI `00:50:F2` type 4) | WPS spec §12 | (per field) | wordlist + `device_store` | -| OWE Transition SSID (vendor IE OUI `50:6F:9A` type 28) | WFA OWE §4 | `owe_transition_ssids` | `essid_map`, wordlist | -| Cisco CCX1 AP name (tag 133) | Cisco CCX v1 §A.3 | `ccx1_ap_names_extracted` | wordlist | -| Vendor AP names (tag 221, multiple OUIs) | wireshark `packet-ieee80211.c` | `vendor_ap_names_extracted` | wordlist | -| Multiple BSSID profile (tag 71 / sub-BSSID) | §9.4.2.45a + §35.2.2 | `multiple_bssid_profiles` | `essid_map` | -| Reduced Neighbor Report BSSIDs (tag 201) | §9.4.2.170 | `rnr_bssids_extracted` | stats only (MAC, not seeded into -W) | -| Wi-Fi Direct (P2P) device name (vendor IE OUI `50:6F:9A` type 9) | WFA Wi-Fi Direct | `p2p_device_names_extracted` | wordlist | -| FILS Discovery SSID (Public Action 34) | §9.6.7.36 | `fils_discovery_ssids` | `essid_map`, `essid_set`, wordlist | -| Action Neighbor Report SSID (Action cat 5) | §9.6.6.6 | `action_nr_req_ssids` | 
`essid_set` | -| ANQP Venue Name (Info ID 258) | §9.4.5 | (per element) | wordlist | -| ANQP Domain Name List (Info ID 263) | §9.4.5 | (per element) | wordlist | -| ANQP NAI Realm (Info ID 268) | §9.4.5.10 | (per element) | wordlist | -| ANQP Hotspot 2.0 Operator Friendly Name | HS2.0 Tech Spec §4.3 | (per element) | wordlist | -| EAP-Identity (Code 1/2 Type 1) | RFC 3748 §5.1 | (always) | `identity_set`, `username_set`, wordlist | -| EAP outcome (Code 3/4) | RFC 3748 §4.2 | `eap_success_frames`, `eap_failure_frames` | stats only | - -Out of scope: DPP / Wi-Fi Easy Connect (§1), pure SAE / OWE authentication frames (no PSK to crack), Roaming Consortium / BSS Load / Interworking / RSNXE / DMG capabilities (numeric-only IEs, no plaintext value), Short SSID (irreversible CRC-32). - -**MSDU fragment reassembly counters** (`stats.fragment_stats`, populated by `src/store/fragments.rs`). - -- `fragments_seen` -- non-final fragments buffered for later concatenation. -- `fragments_reassembled` -- final fragments that completed an MSDU and triggered `take_completed`. -- `fragments_dropped_disorder` -- final fragment arrived without a matching fragment-0 (orphan). Body is still passed through the EAPOL parser as a single MSDU in case of glitched MoreFrag bits on what is actually a complete frame. -- `fragments_dropped_overflow` -- in-flight buffer hit `MAX_ENTRIES` and the oldest entry was evicted to make room for a new fragment-0. - -**PMKID counters per source (S1-S20).** Each `PmkidSource` variant has its own counter. Also: total, useful, useless, faulty, best. - -**EAP / RADIUS / TACACS+ counters.** EAP ID, EAP request/response, method breakdown (MD5, LEAP, MSCHAPv2, PEAP, TLS, TTLS, SIM, AKA, Expanded), RADIUS Access-Request/Challenge/Accept/Reject, TACACS+ AUTHEN/AUTHOR/ACCT. v1 counts only; v2 writes the hashcat-compatible output. - -**IP / transport counters** (informational): IPv4, IPv6, TCP, UDP, ICMPv4, ICMPv6, GRE. 
- -**RSN capabilities histogram** (`stats.rsn_caps_histogram`): raw 2-byte hex distribution per §9.4.2.24.4 Figure 9-374. B6/B7 drive per-`(AP, STA)` PMF annotation. - -**Cipher suite counters.** Per-suite counts under OUI `00:0F:AC` (CCMP-128, GCMP-128, GCMP-256, CCMP-256, BIP-CMAC-128, BIP-GMAC-128, BIP-GMAC-256, BIP-CMAC-256, TKIP, WEP-40, WEP-104). `stats.unknown_cipher_count` for unrecognised selectors; `stats.vendor_cipher_count` for non-`00:0F:AC` OUIs. - -### §9.4 Phase 4 (Emit) counters - -- EAPOL pairs: total, useful, best, ignored-oversized, written-to-22000, written-to-37100, rogue pairs, pairs-from-zeroed-PMK, pairs-from-zeroed-PSK. -- Per N#E# combo counts (six counters: N1E2, N1E4, N3E2, N2E3, N4E3, N3E4) - individually before dedup, plus equivalence-class survivor counts after `--dedup-hash-combos` collapse. -- RSN PMKID emission: total, useful, useless, faulty, best, PSK, FT-PSK, rogue, from-zeroed-PMK, from-zeroed-PSK, written-to-22000, written-to-37100. -- Per-AKM hash-emission decisions: counts by AKM selector (`00-0F-AC:x`) of hashes emitted vs suppressed vs counted-only, with the Table number cross-referenced to §6 in the line label. -- Per-type-code line counts: 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11 (the 11-type classification of §2). One counter per output sink: `--22000-out`, `--37100-out`, `-o`, `--wpa1-out`, `--wpa2-out`, `--psk-sha256-out`, `--ft-out`, `--psk-sha384-out`, `--ft-psk-sha384-out`. -- FT specifics: R0KH-ID / R1KH-ID / MDID observed counts. -- Dedup stats: fingerprint collisions per line-kind byte, unique lines written per output file, duplicates suppressed. - -### §9.5 Phase 5 (Report) counters - -- Wallclock breakdown per phase. -- OWE Transition Mode pairs (`stats.owe_transition_pairs`). -- MLO capture detected (`stats.mlo_capture_detected`). -- Weird-format counts: Kuznetzov pcap records, AVS-within-Prism frames, pcapng nanosecond-resolution interfaces, multi-SHB pcapng files. 
-- NMEA / GPS records observed (count only in v1; structured GPS output deferred to v2 via `--nmea-out`). - -### §9.6 Hcxpcapngtool parity - -Any stat, frame type, or metadata category `hcxpcapngtool` emits, wpawolf emits too. Where `hcxpcapngtool` silently drops: - -- FT-PSK frames > 255 B (wpawolf has no size gate per §4 invariant 3). -- Global-dedup-across-the-whole-capture (wpawolf has SipHash global set per §4 invariant 5). -- Relay frames without `--all` (wpawolf processes WDS unconditionally per §4 invariant 4). - -wpawolf emits and documents the difference. The Phase 8 superset test in `tests/integration/superset_test.rs` enforces this parity at every release. +The closing stats banner -- its full row-by-row contract (label, backing field, spec source, why-we-care, and drop behaviour for every line), the four reconciliation identities, the W=60 / 58-char-label formatting rules, the four-class disposition taxonomy, and the hcxpcapngtool parity-and-exclusion list -- lives in its own file, [`STATS.md`](STATS.md). That file is the authoritative catalogue: `make audit-stats` (`tools/audit_stats.sh`, run by `make check-all`) asserts that every public field of `Stats` (`src/stats.rs`) and `FragmentStats` (`src/store/fragments.rs`) is documented there, in both directions, so the banner and the contract cannot drift apart. The banner is rendered by `Stats::summary_string`, printed unconditionally to stdout by `Stats::print_summary`, and organised into five sections (one per pipeline phase). The release-time cross-version verification that complements the per-run banner is documented below. 
-### §9.7 Operational verification -- cross-version comparison +### §9.1 Operational verification -- cross-version comparison Beyond unit + fixture tests, wpawolf is verified at release time against a local multi-vendor capture set by re-running every prior release plus the current `HEAD` binary in both `WIDE` (bare) and `STRICT` (`--strict` bundle) modes alongside the upstream `hcxpcapngtool` in `default` and `wide` modes, then sorted-unique-diffing the resulting hashcat lines per capture and across the run. The verification scripts and their inputs are developer-local (kept out of the repository) since they reference operator-side capture paths; the methodology and the three invariants they pin are documented here so any contributor can reproduce them on their own captures. @@ -1537,7 +1415,7 @@ The verification pins three invariants: 1. **Cross-version drops.** For each adjacent (older, newer) version pair, every line emitted by the older binary on a capture must also appear in the newer binary's output. Any drop must trace to a documented intentional spec-compliance transition (e.g. v0.3.5's Mesh Control bit gate, v0.3.6's MessageStore dedup-on-insert) -- never to a regression. 2. **Superset invariant.** `hcx-default ⊆ wpawolf-HEAD-WIDE` per capture. Any hcx-only line must trace to a documented per-(AP, STA) precision difference (a different `message_pair` flag byte for the same body, i.e. a body-matched diff -- not a genuinely missing handshake) -- never to a missing line. The FLAG_NC three-source rule (CC-1, see §5.7) and the FT-PSK PMKID `message_pair` byte (see §6.7 and `hcxpcapngtool.h:386-390`) are the two output-format fixes that closed the bulk of pre-v0.3.7 violations; residual differences are all body-matched flag-byte differences attributable to hcx-default's data-structure quirks (AP-wide M1 cross-leakage and 20-entry eviction window). -3. 
**Mode parity `STRICT ⊆ WIDE`.** For every (capture, version, channel) tuple, the STRICT line-set must be a subset of the WIDE line-set. The `--strict` bundle (`--eapoltimeout` / `--rc-drift` / `--dedup-hash-combos` / `--per-file` / `--nc-dedup`) is a pure output filter; none of its passes can synthesize lines the WIDE pipeline did not produce. Any violation is a P0 STRICT-mode logic bug. The fixture-level test `tests/integration/mode_parity_strict_subset_wide.rs` gates the same invariant in CI without requiring an external capture set. +3. **Mode parity `STRICT ⊆ WIDE`.** For every (capture, version, channel) tuple, the STRICT line-set must be a subset of the WIDE line-set. The `--strict` bundle (`--eapoltimeout` / `--rc-drift` / `--dedup-hash-combos` / `--nc-dedup`) is a pure output filter; none of its passes can synthesize lines the WIDE pipeline did not produce. Any violation is a P0 STRICT-mode logic bug. The fixture-level test `tests/integration/mode_parity_strict_subset_wide.rs` gates the same invariant in CI without requiring an external capture set. 
--- @@ -1578,16 +1456,17 @@ src/ main.rs entry point, arg parsing, orchestration lib.rs public API for integration tests input/ Phase 1 (§3.1) - mod / pcapng / pcap / gzip - link/ Phase 2 (§3.2) - mod / radiotap / ppi / prism / avs + link/ Phase 2 (§3.2) - mod / radiotap / ppi / prism / avs / sll / fcs / recover ieee80211/ Phase 2 (§3.2) - mod / frame / ie / rsn / ft / eapol / eap / amsdu / anqp extract/ Phase 3 (§3.3) - per-frame handlers routing to stores - store/ Phase 3 (§3.3) - mod / messages / pmkid / essid / fragments / auxiliary + store/ Phase 3 (§3.3) - mod / messages / pmkid / essid / fragments / auxiliary / disk_messages pair/ Phase 4 (§3.4) - mod / combos / constraints / collapse / nc_dedup - output/ Phase 4 (§3.4) - mod / hashcat / wordlists / device_info / dedup + output/ Phase 4 (§3.4) - mod / hashcat / wordlists / device_info / dedup / disk_dedup stats.rs Phase 5 (§3.5) - counters, summary progress.rs periodic progress line emitter debug.rs --debug diagnostic mode log.rs structured logging + mem_monitor.rs RSS monitor driving the disk-backed fallback mem_stats.rs --mem-stats per-store footprint table strings_scan.rs --wordlist-scan IE plaintext scanner types.rs shared: MacAddr, MacPair, MsgType, AkmType, MicBytes, Error @@ -1629,7 +1508,7 @@ pub struct DedupSet { seen: HashSet } - Unit tests colocated with modules (`#[cfg(test)] mod tests`). - Integration tests in `tests/integration/*.rs`: - `superset_test.rs` runs both `hcxpcapngtool` and `wpawolf` on the same capture and asserts `wpawolf_output >= hcxpcapngtool_output` line by line. The "never miss a hash" regression oracle. - - `per-AKM format_outputs_per_akm.rs`, `per-AKM format_combined_o.rs`, `per-AKM format_dedup_per_sink.rs` -- per-sink fan-out and dedup checks for the 11-type classification outputs. + - `extended_outputs_per_akm.rs`, `extended_combined_o.rs`, `extended_dedup_per_sink.rs` -- per-sink fan-out and dedup checks for the 11-type classification outputs. 
- `pmkid_coverage.rs` -- crafted in-memory pcap exercising the 20 spec-defined PMKID extraction sites; asserts no-dup, WPA*01* field count = 9, WPA*03* field count = 12. - `cross_file_pairing.rs` -- M1 in file A, M2/3/4 in file B; asserts the shared `MessageStore` reassembles the handshake. - `fragment_reassembly.rs` -- 802.11 MSDU fragment reassembly per `(SA, RA, SeqNum)` for FT-PSK M2 frames split by the radio MTU. @@ -1641,5 +1520,5 @@ pub struct DedupSet { seen: HashSet } Rust 2024 edition, stable toolchain pinned in `rust-toolchain.toml`. `Cargo.toml` enforces `unsafe_code = "forbid"`; `lib.rs` re-states `#![forbid(unsafe_code)]`. Clippy: `all` at `deny`, `pedantic` / `nursery` / `cargo` at `warn`. `unwrap_used`, `expect_used`, `panic`, `indexing_slicing` at `warn`. `dbg_macro`, `todo`, `unimplemented`, `mem_forget` at `deny`. Cast lints (`cast_possible_truncation`, `cast_sign_loss`, `cast_precision_loss`, `cast_possible_wrap`) at `warn`. `wildcard_imports` at `deny` (tests may `#[allow]`). `.cargo/config.toml` sets `rustflags = ["-D", "warnings"]`. -`make check-all` runs `fmt`, `lint` (clippy zero warnings), `audit` (cargo deny), `check`, `test`, `doc` (rustdoc `-D warnings`), `hygiene` (ASCII + LF), `machete` (unused deps). +`make check-all` runs `fmt`, `lint` (clippy zero warnings), `audit` (cargo deny), `audit-citations` (hcxpcapngtool line-citation check), `audit-stats` (`STATS.md` banner-contract vs `src/stats.rs` / `src/store/fragments.rs` drift gate), `check`, `test-matrix`, `doc` (rustdoc `-D warnings`), `hygiene` (ASCII + LF), `machete` (unused deps). 
diff --git a/Makefile b/Makefile index e029286..35e9a5c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: fmt fmt-fix lint lint-fix doc dev build check \ test test-release test-matrix check-parity \ - audit audit-citations machete \ + audit audit-citations audit-stats machete \ ascii-check lf-check hygiene \ build-linux-musl build-linux-arm-musl \ build-macos-arm build-macos-x86 build-macos-universal \ @@ -122,6 +122,13 @@ audit: audit-citations: ./tools/audit_citations.sh +# Verify STATS.md (the stats banner contract) against the +# actual counter fields in src/stats.rs and src/store/fragments.rs, both +# directions. Catches drift when a counter is added, renamed, or removed +# without updating the catalogue -- the doc-rot class found in the 2026-06 audit. +audit-stats: + ./tools/audit_stats.sh + machete: $(CARGO) machete @@ -345,4 +352,4 @@ clean: # -- Gates --------------------------------------------------------------------- # Full verification gate -- run before every push. -check-all: fmt lint audit audit-citations check test-matrix doc hygiene machete +check-all: fmt lint audit audit-citations audit-stats check test-matrix doc hygiene machete diff --git a/STATS.md b/STATS.md new file mode 100644 index 0000000..d9b69fc --- /dev/null +++ b/STATS.md @@ -0,0 +1,256 @@ +# STATS.md -- the stats banner contract + +This file is the **authoritative, line-by-line catalogue** of wpawolf's closing stats banner. It supersedes the old `ARCHITECTURE.md §9` catalogue; `ARCHITECTURE.md §9` is now a one-paragraph pointer here. The contract is machine-checked: `make audit-stats` (`tools/audit_stats.sh`, wired into `make check-all`) asserts that every public field of `Stats` (`src/stats.rs`) and `FragmentStats` (`src/store/fragments.rs`) is named in backticks in this file, and that every `stats.` reference in the docs is a real field. A counter cannot be added, renamed, or removed without this file moving in lockstep.
+ +The banner is rendered by `Stats::summary_string` and printed by `Stats::print_summary`. Every run prints it unconditionally to stdout (FR-CLI-4); stderr stays silent. + +## Design principle + +Every packet wpawolf reads, and every classification it makes, lands in a counter. The banner is built so an operator can **account for every packet** and **see every drop**: nothing the tool decides about a frame is thrown away silently. The four reconciliation identities in the next section are the formal statement of that principle, and `tools/audit_stats.sh` plus the `banner_labels_fit_column` unit test keep the catalogue and the code from drifting apart. + +## Reconciliation identities + +These are the sums an operator can check against the banner. They hold by construction; a future change that breaks one is a bug. + +1. **Packet accounting (Phase 1 + 2).** Every packet that `next_packet` yielded reaches exactly one terminal disposition: + + ``` + total_packets = packets_unknown_linktype (dropped: no IDB) + + link_errors (dropped: link strip failed all recovery tiers) + + ctrl_frames (control frames: not body-parsed) + + malformed_mac_hdr (dropped: MAC header unparseable) + + truncated_after_header (dropped: body past captured length) + + extension_frames (802.11 extension frames: not body-parsed) + + mgmt_frames + data_frames (handed to Phase 3 extraction) + ``` + + `unreadable_packets` is the count of records that errored on read; those never entered `total_packets` (the counter increments only on a successful `next_packet`). `recovered_tier2` / `recovered_tier3`, `lenient_proto_version`, the FCS outcomes, and the per-band counts are *sub-classifications* of frames that did reach `mgmt`/`data`/`ctrl` -- they overlap those buckets and are not separate terminal states. 
+ + **This identity is enforced, not just asserted.** `Stats::packets_accounted()` sums the eight terminal buckets; the banner prints `packets unaccounted (BUG; report this)` (= `total_packets - accounted`) and `frames multi-counted (BUG; report this)` (= `accounted - total_packets`) -- both 0 on every correct run, so neither renders. The pipeline `run()` debug-asserts the equality, and `tests/integration/generated_corpus.rs::packet_accounting_holds_across_generated_corpus` drives the whole fixture corpus through the binary and fails if either BUG row appears. A future silent `continue` that drops a packet without a counter therefore cannot pass the test suite. + +2. **Management subtype accounting (Phase 3).** `mgmt_frames` equals the sum of all named management subtype counters plus `mgmt_reserved_subtype`. (PMF-protected Action frames are counted in `action_frames` and then short-circuited; see `mgmt_protected_action_skipped`.) + +3. **Pair accounting (Phase 4).** `eapol_pairs_generated = eapol_pairs_useful + dedup_dropped_pairs`. The opt-in filters reduce the candidate set *before* generation, so `pairs_time_filtered` / `pairs_rc_filtered` are reported as their own lines, not folded into the gap. The per-combo `pairs_written_n*` children sum to `eapol_pairs_useful` (the written total), not to the generated total. + +4. **Hash accounting (Phase 5).** `hashes emitted (total)` is the sum over the 11-type table (`hash_type_emitted`). Its `EAPOL hash lines` / `PMKID hash lines` children are the odd-code / even-code halves of that same table, so they always sum to the total. PMKID material that did not reach a sink is accounted by `dedup_dropped_pmkids`, `emit_dropped_unclassified_akm`, `emit_dropped_ft_no_context`, and the Phase-3 garbage/`essid_unresolved` drops. + +## Formatting contract + +- Rows render as `{label:.<60}: {value}` -- dotted leaders to a fixed value column at **W=60**; section headers fill to 70. 
+- Every label is **at most 58 characters** including indent (so at least two leader dots always render) and contains **no embedded `": "`** (the first `": "` on any row is always the label/value separator, so `awk -F': '` is unambiguous). Both rules are enforced by the `banner_labels_fit_column` unit test and a `debug_assert` in the row macros. +- Indentation (0 / 2 / 4 spaces) expresses hierarchy: children sum to their parent unless a label says `pre-dedup` / `post-dedup`. +- Skeleton rows (`stat!`) always print; everything else (`nz!`) prints only when nonzero, and split rows never print a bare-zero side. +- Values are raw integers, no separators. Wallclock and throughput are the only one-decimal values. + +## Disposition classes + +Every row carries exactly one disposition, signalled in its label suffix and tabulated in the **Disposition** column below: + +| Class | Meaning | Drops a packet/hash? | +|---|---|---| +| **skeleton** | Always-printed structural total. | no | +| **informational** | Something observed; no data impact and not even an anomaly. | no | +| **diagnostic** | An anomaly worth noting; the frame was still processed. | no | +| **recovered** | A problem was worked around and the data was kept. | no | +| **dropped** | A frame, PMKID, or hash line was lost. **The label always says so.** | **yes** | + +--- + +## Phase 1 -- Ingest + +File-container reading and packet integrity. Source specs: pcapng = draft-ietf-opsawg-pcapng-05; pcap magic / DLT = libpcap `sf-pcap.c` / `dlt.h`. + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| input files processed / file name | `input_file_count`, `last_file` | pcapng §4 / pcap header | How many captures were actually opened; single-file runs keep the hcxpcapngtool one-line layout. 
| skeleton | +| file formats / endians / network types seen | `file_formats_seen`, `endians_seen`, `dlt_descs_seen` | pcap/pcapng headers | Spot one odd file (wrong format/endian/DLT) in a large directory walk. | informational | +| first / last packet (epoch s), duration | `timestamp_first_us`, `timestamp_last_us` | pcapng §4.3 EPB / pcap record ts | Capture span; a near-zero duration on a big file flags a clock problem. | informational | +| bytes ingested (MiB) | `bytes_ingested` | file sizes | Denominator for the Phase 5 throughput row. | informational | +| packets total | `total_packets` | per-record | The packet-accounting denominator (identity 1). | skeleton | +| link/parse errors (frames dropped) | `link_errors` | §9 link-layer | Link-layer strip failed after every recovery tier; the 802.11 frame was never exposed. | **dropped** | +| MAC header malformed (frame dropped) | `malformed_mac_hdr` | §9.2.4.1 | Frame Control / addresses unparseable; cannot dissect. | **dropped** | +| non-zero Protocol Version (forgiven; processed) | `lenient_proto_version` | §9.2.4.1.1 | FC version != 0 (reserved) but the v0 MAC layout still parses (matches tshark); the frame is kept. | diagnostic | +| files with truncated trailing record | `truncated_capture_files`, `unreadable_packets` | FR-IN-10 | A capture ended mid-record; earlier records are kept, the partial tail is lost. | **dropped** (tail) | +| input files skipped (magic unrecognised) | `files_skipped_unknown_format` | pcap magic | A non-capture file (sub-4-byte stub, junk) in the input set; routed to `[skipped_input]`. | **dropped** (file) | +| packets dropped (unknown link type; no IDB) | `packets_unknown_linktype` | pcapng §4.2 | A pcapng EPB referenced an `interface_id` with no preceding IDB, so the DLT is unknown and the packet cannot be decoded. 
| **dropped** | +| packets dropped (truncated past MAC header) | `truncated_after_header` | snaplen | The MAC header parsed but its `body_offset` ran past the captured bytes (snaplen truncation / corrupt length); the body and any EAPOL/IE it held are gone. | **dropped** | +| packets with zeroed timestamps (informational) | `packets_zeroed_timestamp` | pcapng §4.3 | Capture-tool artifact (no clock); the frame is processed normally. | informational | +| timestamps out of sequence (informational) | `out_of_sequence_timestamps` | per-record ts | A packet's timestamp went strictly backward within a file (mergecap / hand-edit); processing is order-independent. | informational | + +--- + +## Phase 2 -- Decode + +Link-layer strip, FCS resolution, tiered recovery, 802.11 frame-type split, RF context, EAPOL wire mix. Source specs: radiotap.org; IEEE 802.11-2024 §9.2.4 (Frame Control), §9.2.4.7 (FCS), §9.3.2.2.2 (A-MSDU), §9.2.4.4 (fragmentation), §12.7.2 (EAPOL-Key). + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| management / data / control frames | `mgmt_frames`, `data_frames`, `ctrl_frames` | §9.2.4.1.3 Table 9-1 | The frame-type skeleton; mgmt+data feed extraction, control is terminal. | skeleton | +| extension frames (802.11 amendments) | `extension_frames` | Table 9-1 type 3 | Type-3 (DMG/S1G) frames; counted and not body-parsed. | informational | +| packets unaccounted (BUG; report this) | derived (`total_packets - packets_accounted()`) | identity 1 | Self-check: a packet was dropped without a counter. Always 0; a nonzero value is a silent-drop regression. | **dropped** (BUG) | +| frames multi-counted (BUG; report this) | derived (`packets_accounted() - total_packets`) | identity 1 | Self-check: a packet was counted in two terminal buckets. Always 0; nonzero is a double-count regression. 
| diagnostic (BUG) | +| relay (WDS) frames | `relay_frames` | §9.3.2.1 Table 9-60 | 4-address relay frames carrying EAPOL, deferred to Phase 1.5 for direction resolution -- wpawolf processes WDS unconditionally (invariant 4). | informational | +| WPA / WEP encrypted data frames | `wpa_encrypted_data`, `wep_encrypted_data` | §9.2.4.1.1 B14; §12.5.2.2 / §12.3.4.2 | Protected data frames, split on the KeyID-octet ExtIV bit (1 = TKIP/CCMP/GCMP, 0 = legacy WEP). Capture-quality / cipher-mix signal. | informational | +| PMF-encrypted management frames (802.11w) | `mgmt_protected_frames` | §11.13 | Encrypted Action/mgmt bodies; we have no PTK to decrypt them. | informational | +| Action body dropped (PMF; FT/Mesh PMKIDs unavailable) | `mgmt_protected_action_skipped` | §11.13 | A PMF-protected Action frame whose body (possibly an FT/Mesh PMKID) we cannot read. | **dropped** (body) | +| A-MSDU aggregated Data frames / subframes recovered | `amsdu_frames_seen`, `amsdu_subframes_total` | §9.3.2.2.2 | EAPOL hidden in A-MSDU subframes 2..N would be missed without subframe iteration. | recovered | +| radiotap it_version != 0 (Tier 1 recovered) | `radiotap_version_nonzero` | radiotap.org | A firmware emits non-zero `it_version`; we read `it_len` regardless instead of dropping (1.5M frames in one corpus). | recovered | +| frames recovered via it_present computation (Tier 2) | `recovered_tier2` | radiotap.org | Corrupt `it_len`; header length recomputed from the `it_present` bitmask. | recovered | +| frames recovered via CRC-32 offset scan (Tier 3) | `recovered_tier3` | ISO 3309 CRC-32 | All header fields corrupt; the 802.11 frame located by scanning for the FCS residue `0x2144DF1C`. | recovered | +| FCS stripped (header + CRC-32 agree) | `fcs_header_and_crc_agree` | §9.2.4.7 | Both the radiotap flag and the CRC confirm a trailing FCS; stripped before IE walking. 
| recovered | +| FCS stripped (CRC-32 detected, header silent) | `fcs_detected_by_crc` | §9.2.4.7 | The header never announced an FCS but the CRC found one; without this those 4 bytes would mis-parse as IE data. | recovered | +| FCS stripped (BADFCS flagged; corrupt on air) | `fcs_badfcs_flagged` | radiotap Flags 0x40 | Radio received the frame with a failed checksum; stripped anyway. | diagnostic | +| FCS stripped (CRC-32 mismatch; no BADFCS flag) | `fcs_crc_mismatch_no_flag` | §9.2.4.7 | Header claimed an FCS, CRC disagreed, no BADFCS flag; trust the header and strip. | diagnostic | +| no FCS present (frame left untouched) | `fcs_neither` | §9.2.4.7 | No trailing FCS; makes the five FCS outcomes account for every frame reaching the resolver. | informational | +| radiotap A-MPDU Status field present | `ampdu_status_frames` | radiotap A-MPDU | A-MPDU aggregation context observed; parser-health signal. | informational | +| fragments buffered for reassembly | `fragment_stats.fragments_seen` | §9.2.4.4 | Non-final MSDU fragments held for out-of-order reassembly. | informational | +| reassembled MSDUs (all fragments present) | `fragment_stats.fragments_reassembled` | §9.2.4.4 | An FT-PSK M2 split across the radio MTU was rebuilt. | recovered | +| incomplete MSDUs (missing fragments in capture) | `fragment_stats.fragments_incomplete` | §9.2.4.4 | Fragments whose siblings never appeared; the MSDU is lost. | **dropped** | +| fragments evicted (safety cap; expect 0) | `fragment_stats.fragments_dropped_safety_cap` | -- | Paranoid 1 M backstop on the in-flight buffer; nonzero means the cap is sized wrong. | **dropped** | +| AWDL frames (Apple AWDL) | `awdl_frames` | Apple AWDL | Apple peer-to-peer traffic; capture-environment signal. 
| informational | +| on 2.4 / 5 / 6 / other band | `band_24ghz`, `band_5ghz`, `band_6ghz`, `band_other` | radiotap Channel | Band distribution from the radiotap channel field; `band_other` keeps the split accounting for every channel-bearing packet. | informational | +| beacon channels 2.4 / 5-6 GHz | `beacon_channels` | §9.4.2.4 DS Param | Channel histogram from Beacons (DS Parameter Set IE). | informational | +| EAPOL KDV 1 / 2 / 3 / 0 / reserved | `eapol_kdv1`, `eapol_kdv2`, `eapol_kdv3`, `eapol_kdv0`, `eapol_kdv_other` | §12.7.2 Table 12-11 | Key Descriptor Version mix; drives KDV-first AKM reconciliation. KDV 0 is the legitimate "AKM-defined" value for the SHA-384 families, split out so it is not flagged as an anomaly. | informational | +| EAPOL RSN / WPA (legacy) descriptor | `eapol_rsn`, `eapol_wpa` | §12.7.2 | Descriptor-type byte (0x02 RSN vs 0xFE WPA legacy). | informational | + +--- + +## Phase 3 -- Extract + +Store population. The management subtype tree, the ESSID block, plaintext surfaces, the EAPOL store block, and the PMKID S1-S20 sources. + +### Management subtype tree + +Source: IEEE 802.11-2024 §9.4.1 (mgmt body fields), Table 9-1 (subtypes), §9.4.2 (IEs). Each is a per-subtype total or a sub-field of one; together with `mgmt_reserved_subtype` they reconcile to `mgmt_frames` (identity 2). + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| BEACON (total) | `beacon_frames` | §9.3.3.3 | AP presence; SSID/AKM/PMKID source. | informational | +| SSID wildcard / zeroed / oversized | `beacon_ssid_wildcard`, `beacon_ssid_zeroed`, `beacon_ssid_oversized` | §9.4.2.2 | Hidden/blanked/malformed SSIDs; oversized SSID is rejected, the beacon kept. | informational / **dropped** (oversized SSID only) | +| RSNXE SAE-H2E / SAE-PK / Secure LTF / Protected TWT | `rsnxe_sae_h2e`, `rsnxe_sae_pk`, `rsnxe_secure_ltf`, `rsnxe_protected_twt` | §9.4.2.241 | WPA3 / 11az / 11ax capability bits; capture-feature signal. 
| informational | +| RNR blocks / 6 GHz co-located BSSIDs | `rnr_blocks_parsed`, `rnr_6ghz_colocated` | §9.4.2.170 | Reduced Neighbor Report; 6 GHz co-location discovery. | informational | +| Multi-Link Elements / MLD addrs / groups+SSIDs also keyed under MLD | `mle_basic_seen`, `mle_mld_addrs_learned`, `mld_groups_merged`, `essid_link_macs_merged` | §9.4.2.321, §35.3 | 802.11be MLO: each link-keyed handshake / SSID also gets an MLD-keyed copy (the link form is kept). A multi-link handshake cracks under the MLD MAC; a single-link association to one BSSID of an MLD cracks under the link MAC -- both are emitted so the crackable one is always present. | recovered | +| PROBE RESPONSE (total) / SSID unset / zeroed | `probe_resp_frames`, `probe_resp_ssid_unset`, `probe_resp_ssid_zeroed` | §9.3.3.10 / §9.4.2.2 | Probe Responses answer directed probes; an empty/zeroed SSID there is a capture-quality signal. | informational | +| PROBE REQUEST (undirected / directed) | `probe_req_undirected`, `probe_req_directed` | §9.3.3.9 | Directed probes name an SSID the client has joined before -- wordlist material. | informational | +| ASSOCIATION REQUEST (total) + per-AKM | `assoc_req_frames`, `assoc_req_wpa1`, `assoc_req_wpa2_psk`, `assoc_req_ft_psk`, `assoc_req_ft_psk_sha384`, `assoc_req_psk_sha256`, `assoc_req_psk_sha384`, `assoc_req_sae`, `assoc_req_owe`, `assoc_req_fils`, `assoc_req_pasn`, `assoc_req_enterprise_sha1`, `assoc_req_enterprise_sha256`, `assoc_req_enterprise_sha384`, `assoc_req_tdls`, `assoc_req_appeerkey`, `assoc_req_akm_unknown` | §9.3.3.6, §9.4.2.24 Table 9-190 | Which AKM suites clients negotiated; tells the operator whether a capture even contains crackable PSK/FT-PSK material. | informational | +| ASSOCIATION RESPONSE (total) | `assoc_resp_frames` | §9.3.3.7 | AP-side association acceptance count. 
| informational | +| REASSOCIATION REQUEST (total) + per-AKM | `reassoc_req_frames`, `reassoc_req_*` (same AKM set as `assoc_req_*`) | §9.3.3.8, Table 9-190 | Roaming clients; FT-PSK reassociations are the FT crack source. | informational | +| REASSOCIATION RESPONSE (total) | `reassoc_resp_frames` | §9.3.3.9 | AP-side reassociation acceptance count. | informational | +| AUTHENTICATION (total) + per-algorithm | `auth_frames`, `auth_open_system`, `auth_shared_key`, `auth_fbt`, `auth_sae`, `auth_fils`, `auth_network_eap`, `auth_pasn` | §9.4.1.1 Table 9-43 | Authentication algorithm mix; FBT/SAE/FILS presence shapes what hashes are possible. | informational | +| FT status 52 / 53 / 54 / 55 | `ft_status_r0kh_unreachable`, `ft_status_invalid_pmkid`, `ft_status_invalid_mde`, `ft_status_invalid_fte` | §9.4.1.9 Table 9-92 | Each promoted FT failure status explains a *missing* FT-PSK handshake: the AP refused the FT auth, so no M2/M3 followed. | diagnostic | +| DEAUTHENTICATION (total) | `deauth_frames` | §9.3.3.12 | Session teardown volume. | informational | +| MIC failure, reason 14 | `mic_failure_deauths` | §9.4.1.7 Table 9-90 | The canonical "this handshake will never pair cleanly" signal for a session. | diagnostic | +| DISASSOCIATION (total) | `disassoc_frames` | §9.3.3.11 | Session-end volume. | informational | +| ACTION (total) + children | `action_frames`, `action_nr_req_ssids`, `fils_discovery_ssids`, `action_ft_frames`, `action_mesh_peering`, `anqp_gas_frames`, `anqp_venue_name`, `anqp_domain_name`, `anqp_nai_realm`, `anqp_hs_operator_friendly_name`, `anqp_unknown_info_id` | §9.6, §9.4.5 | Action frames carry FT/Mesh PMKIDs, FILS Discovery / NR-request SSIDs, and ANQP plaintext (venue, domain, realm, operator name) -- all extraction surfaces. | informational | +| ANQP fragmented (dropped) | `anqp_fragmented_skipped` | §9.4.5 | A fragmented ANQP element; reassembly is not implemented, so its plaintext is lost. 
| **dropped** (element) | +| ACTION NO ACK / ATIM / MEASUREMENT PILOT / TIMING ADVERTISEMENT | `action_no_ack_frames`, `atim_frames`, `measurement_pilot_frames`, `timing_advert_frames` | Table 9-1 | Remaining management subtypes, counted for completeness. | informational | +| RESERVED subtype (7/15) | `mgmt_reserved_subtype` | Table 9-1 | Reserved management subtypes; counted so the subtype rows reconcile to `mgmt_frames` (identity 2). | diagnostic | + +### ESSID and plaintext surfaces + +Source: §9.4.2 (IEs), vendor specs (WPS, OWE, CCX, P2P), RFC 3748 (EAP identity). A regression that drops one of these is an `extraction_coverage.rs` failure. + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| ESSID (unique APs seen) | `essid_count` | §9.4.2.2 | How many distinct APs have a known SSID (needed for PMK derivation). | skeleton | +| ESSID changes (per-AP maximum) | `essid_changes_max` | §9.4.2.2 | Largest per-AP SSID-variant count; a high value usually means RF-rotted duplicate beacons. | informational | +| hash lines dropped (no SSID resolved) / distinct APs affected | `essid_unresolved_emissions`, `essid_unresolved_aps` | FR-ESSID-3 | Hashes dropped at emit because the AP's SSID was never seen -- uncrackable. Each AP also gets an `[essid_not_found_summary]` log line. | **dropped** | +| SSID List / Country / Mesh ID / WPS / Vendor AP / OWE / CCX1 / Time Zone / Multiple-BSSID / RNR BSSIDs / P2P | `ssid_list_entries`, `country_codes_extracted`, `mesh_ids_extracted`, `wps_probe_req_extracted`, `vendor_ap_names_extracted`, `owe_transition_ssids`, `ccx1_ap_names_extracted`, `time_zones_extracted`, `multiple_bssid_profiles`, `rnr_bssids_extracted`, `p2p_device_names_extracted` | §9.4.2.71/.9/.97/.170, WPS/OWE/CCX/P2P vendor specs | Plaintext surfaces feeding `-W` / `-E` / `-R` -- wordlist material from IEs and vendor elements. 
| informational | +| Wordlist IE-scan runs inserted | `wordlist_scan_ie_runs` | -- | Printable-ASCII runs swept from IE bodies for the `--wordlist-scan` delta output. | informational | +| EAP identities / usernames extracted | `identities_extracted`, `usernames_extracted` | RFC 3748 §5.1 | EAP identity / inner-method peer-identity strings; printed even when `-I`/`-U` are not configured. | informational | + +### EAPOL store block + +Source: IEEE 802.11-2024 §12.7 (4-way handshake, EAPOL-Key), §12.3.2 (preauth EtherType), §9.2.4.8.3 (Mesh Control); RFC 3748 §4.2 (EAP outcome). + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| M1 / M2 / M3 / M4 messages + auth-len max | `eapol_m1`, `eapol_m2`, `eapol_m3`, `eapol_m4`, `m1_auth_len_max`, `m2_auth_len_max`, `m3_auth_len_max`, `m4_auth_len_max` | §12.7.6 Table 12-10 | The raw handshake-message inventory; the auth-len-max rows mirror hcxpcapngtool's `body / frame` widths. | skeleton | +| NULL / 0xFF / repeating-pattern nonce rejected (+ on-M4 split) | `null_nonce_rejected`, `null_nonce_rejected_on_m4`, `ff_nonce_rejected`, `ff_nonce_rejected_on_m4`, `repeat_nonce_rejected`, `repeat_nonce_rejected_on_m4` | §12.7.6.5 NOTE 9 | Garbage Key Nonce: an EAPOL line built from it cannot crack. The on-M4 split separates the spec-zero expected case from an abnormal nonce on M1/M2/M3. | **dropped** | +| NULL / 0xFF / repeating-pattern MIC rejected | `null_mic_rejected`, `ff_mic_rejected`, `repeat_mic_rejected` | §12.7.2 | Garbage Key MIC (M2/M3/M4); the line cannot crack. | **dropped** | +| NULL / 0xFF / repeating-pattern PMKID rejected | `null_pmkid_rejected`, `ff_pmkid_rejected`, `repeat_pmkid_rejected` | §12.7.1.3 | Garbage PMKID; not crackable material. | **dropped** | +| ESSID control bytes (informational; shipped unchanged) | `essid_control_bytes_warned` | §9.4.2.2 | SSID with a `0x00..=0x1F` byte; valid on the wire, shipped to hashcat unchanged -- NOT a drop or a transformation. 
| informational | +| session time gap max | `eapol_time_gap_max_us` | §12.7.6 | Largest gap between paired messages; prints in ms, or us when sub-millisecond. | informational | +| ANonce M1/M3 mismatch sessions | `anonce_m1_m3_mismatch_sessions` | §12.7.6.4 | M1 and M3 ANonce differ (retransmit/PMK-cache/mid-capture start); both anchors are still emitted. | diagnostic | +| EAPOL classified by direction (Tier 1) + WDS tiers | `eapol_tier1_direction`, `eapol_tier1b_essid`, `eapol_tier2_ack_discovery`, `eapol_tier3_flag_fallback` | §9.3.2.1 Table 9-60 | How each EAPOL frame's (AP, STA) direction was resolved; the WDS tiers recover relay-frame handshakes. | recovered | +| direction/ACK mismatches (diagnostic; still paired) | `eapol_ack_mismatches` | §12.7.2 | MAC-header direction disagrees with the Key ACK bit; direction is authoritative, the pair is kept. | diagnostic | +| preauthentication frames (EtherType 0x88C7) | `eapol_preauth_frames` | §12.3.2 | Inter-AP preauth EAPOL, parsed on the same path as 0x888E. | informational | +| LLC accepted but EAPOL parse rejected (frame dropped) | `eapol_llc_invalid` | §12.7.2 | The LLC/packet-type gate said EAPOL-Key but the parser bailed (truncation, bad descriptor/KDV, sentinel nonce/MIC). | **dropped** | +| Mesh Data frames recovered (Mesh Control unwrapped) | `mesh_control_frames` | §9.2.4.8.3 | Mesh BSS data frame whose Mesh Control header was skipped to expose the inner MSDU. | recovered | +| Mesh Data dropped (bad Mesh Control header) | `mesh_control_malformed` | §9.2.4.8.3 | Mesh Control header with a reserved Address Extension Mode (11) or a too-short body; the inner MSDU is unrecoverable. | **dropped** | +| EAP-Success / EAP-Failure frames | `eap_success_frames`, `eap_failure_frames` | RFC 3748 §4.2 | Terminal EAP outcomes; capture-quality triage for mixed PSK/Enterprise traffic. 
| informational | + +### PMKID sources (S1-S20) + +Source: IEEE 802.11-2024 §12.7.1.3 (PMKID), §12.7.2 (M1/M2 KDE), §9.6.7 (FT Action), Wi-Fi Passpoint (OSEN). `pmkids_found` is the insertion total (post-garbage, post-insert-dedup, pre-global-dedup); the children split it by extraction site and by AKM family. + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| PMKID store insertions (total, pre-dedup) | `pmkids_found` | §12.7.1.3 | The PMKID material inventory before global dedup. | skeleton | +| per-source S1-S20 | `pmkid_m1`, `pmkid_m2`, `pmkid_assoc_req`, `pmkid_reassoc_req`, `pmkid_ft_auth`, `pmkid_fils_auth`, `pmkid_pasn_auth`, `pmkid_ft_action`, `pmkid_probe_req`, `pmkid_beacon`, `pmkid_probe_resp`, `pmkid_mesh`, `pmkid_osen` | §12.7.2, §9.6.7, OSEN | Which of the 20 spec-defined locations each PMKID came from -- the "never miss a PMKID" coverage map. | informational | +| by AKM family (non-FT / FT) | `pmkid_wpa2_psk`, `pmkid_ft_psk` | §12.7.1.3 | Same total split by AKM family (non-FT = PSK/SHA256/SHA384; FT = FT-PSK/FT-PSK-SHA384). | informational | + +--- + +## Phase 4 -- Emit + +Pairing, classification, dedup, and the per-sink output rows. Source: ARCHITECTURE.md §2 (11-type table), §5 (combos), §7 (line format), §8 (FR-PAIR / FR-OUT). + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| output filters active | `filters_active` | FR-CLI-3 | Echoes the resolved filter state so a WIDE run and a `--strict` run are distinguishable from the banner alone. | informational | +| per-hash-type lines emitted (11 rows) | `hash_type_emitted` | §2 | One row per 11-type code -- exactly what hashcat will see, type by type. | skeleton | +| EAPOL pairs generated (total, pre-dedup) | `eapol_pairs_generated` | §5 | Pairs the engine produced before global dedup (identity 3). | skeleton | +| EAPOL pairs written (post-dedup) | `eapol_pairs_useful` | §5 | Pairs that survived dedup; the combo children sum to this. 
| skeleton | +| per-combo written (N1E2 / N3E2 / N1E4 / N2E3 / N4E3 / N3E4) | `pairs_written_n1e2`, `pairs_written_n3e2`, `pairs_written_n1e4`, `pairs_written_n2e3`, `pairs_written_n4e3`, `pairs_written_n3e4` | §5 | Which N#E# combos produced output (AP-less N2E3/N4E3 included). | informational | +| NC / LE / BE flag set | `pairs_nc`, `pairs_le`, `pairs_be` | hcxtools mp byte | Hints passed to hashcat's nonce-error-corrections / endianness handling. | informational | +| NC-dedup lines collapsed / cluster count / max cluster | `nc_dedup_collapsed_lines`, `nc_dedup_cluster_count`, `nc_dedup_max_cluster_size` | §5.8.1 | `--nc-dedup` folded near-identical-nonce siblings into one survivor. | informational (lines folded, not lost) | +| candidates dropped (--eapoltimeout filter) | `pairs_time_filtered` | FR-PAIR-3 | Opt-in session-window filter removed these candidate pairs (zero in WIDE mode). | **dropped** (filter) | +| candidates dropped (--rc-drift filter) | `pairs_rc_filtered` | FR-PAIR-4 | Opt-in replay-counter filter removed these candidate pairs (zero in WIDE mode). | **dropped** (filter) | +| RC gap max | `rc_gap_max`, `rc_drift_enabled` | §5.7 | Largest RC gap among written pairs; suggests an NC threshold. | informational | +| PMKIDs written (post-dedup) | `pmkids_written` | §6 | PMKID hashes that survived dedup at least once. | skeleton | +| dedup dropped (total) + EAPOL/PMKID children | `dedup_dropped`, `dedup_dropped_pairs`, `dedup_dropped_pmkids` | FR-DEDUP | Duplicate hash lines suppressed by global SipHash dedup, split by kind so the pre-dedup totals reconcile. | **dropped** (duplicates) | +| hashes dropped (unclassified AKM; no 11-type) | `emit_dropped_unclassified_akm` | §2 | Extracted crack material whose AKM maps to none of the 11 types, even after AKM-map inference; cannot be formatted. 
| **dropped** | +| hashes dropped (FT context missing; no R0KH-ID) | `emit_dropped_ft_no_context` | §7 | An FT hash (types 6/7/10/11) with no R0KH-ID, so the `WPA*03*`/`WPA*04*` line cannot be built. | **dropped** | +| per hash sink (path + lines written + dedup dropped) | `path_*` family, `lines_*` family, `dropped_*` family | §2, §7 | Each configured hash sink's file, lines written, and per-sink dedup drops. Unconfigured sinks collapse to a count. | informational / **dropped** (per-sink dedup) | +| hash sinks not configured | (derived) | -- | How many of the 9 hash sinks were not requested. | informational | +| per aux sink (path + entries written) | `essid_list_path`+`entries_essid_list`, `probe_list_path`+`entries_probe_list`, `wordlist_path`+`entries_wordlist`, `wordlist_scan_path`+`entries_wordlist_scan`, `identity_list_path`+`entries_identity_list`, `username_list_path`+`entries_username_list`, `device_info_path`+`entries_device_info` | FR-CLI-2 | Each configured auxiliary sink's file and entry count (parity with the hash sinks' line counts). | informational | +| auxiliary sinks not configured | (derived) | -- | How many of the 7 auxiliary sinks were not requested. | informational | + +--- + +## Phase 5 -- Report + +Executive summary an operator reads in five seconds. + +| Line | Field(s) | Source | Why we care | Disposition | +|---|---|---|---|---| +| hashes emitted (total) + EAPOL/PMKID split | `hash_type_emitted` (summed) | §2 | The headline yield, split by attack surface (identity 4). | skeleton | +| distinct hash types observed | `hash_type_emitted` (nonzero count) | §2 | How many of the 11 types this capture produced. | skeleton | +| wallclock Phase 1-3 / Phase 4 / total | `wallclock_p13_ms`, `wallclock_p4_ms` | -- | Where the time went (streaming pass vs pairing+emit). | informational | +| throughput (MiB/s) | `bytes_ingested`, `wallclock_p13_ms` | -- | Ingest rate against the FR-PERF-1 target. 
| informational | +| peak RSS (MiB) | `peak_rss_mib` | -- | High-water memory (lower bound, sampled at the pressure-check cadence). | informational | +| disk-backed fallback engaged | `disk_mode_engaged` | invariant 2 | "yes" means RSS hit the threshold and the run degraded to disk speed instead of aborting. | informational | +| hint (no hashes) | (derived from the largest drop counter) | -- | On a zero-hash run, names the single largest drop so the operator knows where to look first. | diagnostic | + +--- + +## Fields not tied to a single banner line + +These back the banner indirectly (mirrors, scratch, per-sink arrays) and are listed here so the contract names every field: + +- **Per-sink arrays** (rendered by the sink loops above; the `audit-stats` gate exempts these prefixes since they are documented per family, not per member): `lines_22000`/`lines_37100`/`lines_combined`/`lines_wpa1`/`lines_wpa2`/`lines_psk_sha256`/`lines_ft`/`lines_psk_sha384`/`lines_ft_psk_sha384` (`lines_*`); the matching `dropped_*` set; the matching `path_*` set; and the full `reassoc_req_*` per-AKM set (mirror of `assoc_req_*`). +- **Composite field:** `fragment_stats` -- the `Stats` field of type `FragmentStats` whose members (`fragments_seen` etc.) are documented in the Phase 2 table above. +- **Scratch / derived state, not printed directly:** `eapol_last_seen` -- the per-(AP,STA) timestamp map used to compute `eapol_time_gap_max_us`. + +## hcxpcapngtool parity and exclusions + +wpawolf's banner is a content-superset of every **PSK-relevant** row hcxpcapngtool prints, with three structural differences (by design): + +1. hcxpcapngtool prints one summary per input file and resets between files; wpawolf prints one aggregate banner per run (collect-then-pair spans the whole input set). +2. hcxpcapngtool appends `Warning:` / `Information:` advice paragraphs; wpawolf adopts only the single-line zero-hash `hint`. +3. 
hcxpcapngtool's malformed-packet sub-breakdown (BEACON/broadcast-MAC/IE-tag/ESSID errors) lives in wpawolf's aggregated `--log` categories, not banner rows; the banner keeps the single `malformed_mac_hdr` count. + +**Rows hcxpcapngtool prints that wpawolf intentionally excludes:** `skipped packets` and every `(use --all)` variant (wpawolf processes everything); zeroed-PSK / zeroed-PMK / ROGUE rows (superseded by the garbage-pattern gate); hccap / hccapx / JtR sink rows (legacy formats out of scope); EAP-MD5/LEAP/MSCHAPv2 pair rows, PPP-CHAP/PAP, TACACS+, RADIUS (v2 -- planned, not implemented); the NMEA/GPS block (v2); pwnagotchi / hcxhash2cap beacon fingerprints; FILS-PFS/PK/EPPKE auth sub-variants (folded into `auth_fils` / `auth_pasn`); `RESERVED MANAGEMENT frame` advisory wording; the `--max-essids` advisory; per-message `oversized` rows (no size gate exists to overflow); IP / transport / cipher-suite / RSN-capabilities histograms (no PSK relevance -- PMF presence is observable via `mgmt_protected_frames`). + +The Phase 8 superset test (`tests/integration/superset_test.rs`) enforces output-line parity at every release; `make audit-stats` enforces this catalogue against `src/stats.rs`. diff --git a/src/extract/beacon.rs b/src/extract/beacon.rs index 5e461d1..4786eaf 100644 --- a/src/extract/beacon.rs +++ b/src/extract/beacon.rs @@ -65,7 +65,12 @@ pub fn process_beacon_or_probe_resp( // Extract SSID from IE id=0. [IEEE 802.11-2024] §9.4.2.3 for ie in iter_ies(ies) { if ie.id == 0 { - // Beacon SSID quality counters (hidden/zeroed/malformed). + // SSID quality counters (hidden/zeroed/malformed), split per + // subtype: Beacons keep their three-way classification; Probe + // Responses get the unset/zeroed pair for hcxpcapngtool parity + // (a Probe Response answering a directed probe should never + // carry a wildcard SSID -- when it does, that is a capture + // quality signal worth surfacing). 
if mac_hdr.subtype == SUBTYPE_BEACON { if ie.value.is_empty() { stats.beacon_ssid_wildcard += 1; @@ -74,6 +79,10 @@ pub fn process_beacon_or_probe_resp( } else if ie.value.iter().all(|&b| b == 0) { stats.beacon_ssid_zeroed += 1; } + } else if ie.value.is_empty() { + stats.probe_resp_ssid_unset += 1; + } else if ie.value.len() <= 32 && ie.value.iter().all(|&b| b == 0) { + stats.probe_resp_ssid_zeroed += 1; } if !ie.value.is_empty() { insert_essid(essid_map, mac_hdr.ap, ie.value, timestamp_us, stats, logger); diff --git a/src/extract/data.rs b/src/extract/data.rs index 15a150c..ca2ff3d 100644 --- a/src/extract/data.rs +++ b/src/extract/data.rs @@ -39,10 +39,21 @@ pub fn process_data( fragment_store: &mut FragmentStore, logger: &mut Logger, ) { - // Count WPA/WEP encrypted data frames (Protected Frame bit set). + // Count encrypted data frames (Protected Frame bit set), split WEP vs + // WPA-family on the ExtIV bit: byte 3 of the protected body is the KeyID + // octet, and bit 5 (0x20) is Extended IV -- 0 for WEP ([IEEE 802.11-2024] + // §12.3.4.2), 1 for TKIP/CCMP/GCMP ([IEEE 802.11-2024] §12.5.2.2). Bodies + // too short to carry the KeyID octet stay in the WPA bucket (wire-bit + // driven, no guessing beyond the flag-defined octet). // [IEEE 802.11-2024] §9.2.4.1.1 bit B14 if mac_hdr.protected { - stats.wpa_encrypted_data += 1; + if let Some(&key_id_byte) = body.get(3) + && key_id_byte & 0x20 == 0 + { + stats.wep_encrypted_data += 1; + } else { + stats.wpa_encrypted_data += 1; + } } // --- 802.11 MSDU fragmentation reassembly --- @@ -122,7 +133,13 @@ pub fn process_data( stats.mesh_control_frames += 1; Some(body.get(n..).unwrap_or(&[]).to_vec()) }, - _ => None, + // Reserved Address Extension Mode (11) or a body shorter than the + // Mesh Control header: the inner MSDU is unrecoverable. Counted as a + // drop instead of vanishing silently. 
[IEEE 802.11-2024] §9.2.4.8.3 + _ => { + stats.mesh_control_malformed += 1; + None + }, } } else { None diff --git a/src/extract/mgmt.rs b/src/extract/mgmt.rs index bc72627..acd81d5 100644 --- a/src/extract/mgmt.rs +++ b/src/extract/mgmt.rs @@ -237,6 +237,21 @@ pub fn process_mgmt( let Some(&seq_hi) = body.get(3) else { return }; let seq = u16::from_le_bytes([seq_lo, seq_hi]); + // Status Code: LE u16 at body[4..6]. [IEEE 802.11-2024] §9.3.3.11, + // §9.4.1.9 Table 9-92. The four FT failure codes are promoted to + // their own counters because each one explains a missing FT-PSK + // handshake: the AP refused the FT authentication, so no M2/M3 + // ever followed. + if let (Some(&st_lo), Some(&st_hi)) = (body.get(4), body.get(5)) { + match u16::from_le_bytes([st_lo, st_hi]) { + 52 => stats.ft_status_r0kh_unreachable += 1, // Table 9-92 status 52 + 53 => stats.ft_status_invalid_pmkid += 1, // Table 9-92 status 53 + 54 => stats.ft_status_invalid_mde += 1, // Table 9-92 status 54 + 55 => stats.ft_status_invalid_fte += 1, // Table 9-92 status 55 + _ => {}, + } + } + // PMKID extraction dispatched by algorithm number. match algo { 2 => process_auth_ft(mac_hdr, seq, body, timestamp_us, pmkid_store, akm_map, stats, logger), @@ -250,6 +265,15 @@ pub fn process_mgmt( }, SUBTYPE_DEAUTH => { stats.deauth_frames += 1; + // Reason Code: LE u16 at body[0..2]. [IEEE 802.11-2024] §9.3.3.13, + // §9.4.1.7 Table 9-90. Reason 14 (Message integrity code failure) + // is promoted to its own counter: it is the canonical "this + // handshake will never pair cleanly" signal for the session. 
+ if let (Some(&r_lo), Some(&r_hi)) = (body.first(), body.get(1)) + && u16::from_le_bytes([r_lo, r_hi]) == 14 + { + stats.mic_failure_deauths += 1; + } }, SUBTYPE_ACTION_NO_ACK => { stats.action_no_ack_frames += 1; @@ -257,7 +281,9 @@ pub fn process_mgmt( SUBTYPE_TIMING_ADVERT => { stats.timing_advert_frames += 1; }, - _ => {}, // subtypes 7 (reserved) and >15 + // Reserved management subtypes (7, 15) per [IEEE 802.11-2024] Table 9-1. + // Counted so the per-subtype rows reconcile against `mgmt_frames`. + _ => stats.mgmt_reserved_subtype += 1, } } diff --git a/src/input/mod.rs b/src/input/mod.rs index c6742e6..1ee6ffd 100644 --- a/src/input/mod.rs +++ b/src/input/mod.rs @@ -107,10 +107,12 @@ pub struct FileMetadata { pub const fn dlt_name(dlt: u16) -> &'static str { match dlt { 105 => "DLT_IEEE802_11", + 113 => "DLT_LINUX_SLL", 127 => "DLT_IEEE802_11_RADIO", 119 => "DLT_PRISM_HEADER", 163 => "DLT_IEEE802_11_RADIO_AVS", 192 => "DLT_PPI", + 276 => "DLT_LINUX_SLL2", _ => "DLT_UNKNOWN", } } diff --git a/src/log.rs b/src/log.rs index 891196e..2ffc4cd 100644 --- a/src/log.rs +++ b/src/log.rs @@ -4,7 +4,7 @@ //! **triage tool**: it records events where wpawolf dropped, skipped, or rejected //! data for non-obvious reasons. Obvious high-volume rejections (null PMKID, null //! M4 nonce, out-of-sequence timestamps) are already counted in the stats banner -//! on stderr and do NOT appear in the log. +//! on stdout and do NOT appear in the log. //! //! ## Per-event categories (written immediately, low volume) //! @@ -41,8 +41,8 @@ //! //! ## Line format //! -//! `[category] key=value key=value ...`. Per-event categories lead with -//! `timestamp_us` when frame context is available. MAC addresses are bare +//! `[category] key=value key=value ...`. Per-frame categories carry `file=` and +//! `frame=` context from the stored Logger state. MAC addresses are bare //! lowercase hex (12 chars, no separators). Hex byte fields use `render_lower_hex` //! 
(contiguous lowercase, no separators). diff --git a/src/main.rs b/src/main.rs index 2cbcf3e..09782b9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,9 @@ //! Shared -- binary entry point and Phase 1-5 orchestrator. See ARCHITECTURE.md §3. //! -//! Parses command-line arguments via `clap`, then runs the two-phase pipeline: -//! Phase 1 collects all EAPOL messages and PMKIDs from every input file into in-memory -//! stores; Phase 2 pairs messages and writes output files. See `ARCHITECTURE.md §3`. +//! Parses command-line arguments via `clap`, then runs the collect-then-pair pipeline: +//! Phases 1-3 collect all EAPOL messages and PMKIDs from every input file into the +//! stores; Phase 4 pairs messages and writes output files; Phase 5 prints the summary. +//! See `ARCHITECTURE.md §3`. //! //! Unfiltered by default: all 6 N#E# combinations, unlimited session window, no //! replay-counter check. Add output filter flags (`--rc-drift`, `--dedup-hash-combos`) to @@ -157,7 +158,7 @@ struct Cli { /// Write structured processing log /// - /// Eleven log categories: malformed_frame, plcp_error, unknown_linktype, unknown_akm, essid_not_found_summary, capture_read_error, skipped_input, invalid_nonce, invalid_mic, invalid_pmkid, essid_control_bytes. Each line carries the rejected bytes in hex for forensic grep. + /// Twelve log categories: eight per-event (eapol_key_rejected, invalid_nonce, invalid_mic, invalid_pmkid, unknown_linktype, capture_read_error, skipped_input, essid_not_found_summary) carrying file=/frame= context and rejected bytes in hex for forensic grep, plus four aggregated end-of-run summaries (plcp_error, malformed_frame, essid_control_bytes, unknown_akm). #[arg(short = 'l', long, value_name = "FILE", value_hint = clap::ValueHint::FilePath, help_heading = "Auxiliary output", display_order = 17)] log: Option, @@ -225,7 +226,7 @@ struct Cli { // ---- Runtime ---- /// Number of pairing threads [default: CPU count] /// - /// Phase 4 worker count. 
Groups are assigned via LPT scheduling. Use --threads=1 for reproducible single-threaded output. + /// Phase 4 worker count. Groups are paired via rayon work-stealing. Use --threads=1 for the serial path. #[arg(short = 't', long, value_name = "N", help_heading = "Runtime", display_order = 30)] threads: Option, @@ -395,6 +396,10 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { // --- Debug printer (created once; no-op when --debug is off) --- let debug = DebugPrinter::new(cli.debug); + // Wallclock anchors for the Phase 5 cost block. Phases 1-3 are one streaming + // pass over the input set; Phase 4 is the pairing + emit pass. + let run_start = std::time::Instant::now(); + // --- Initialise stores --- let mut message_store = MessageStore::new(); let mut pmkid_store = PmkidStore::new(); @@ -412,14 +417,14 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { let mut stats = Stats::new(); let mut logger = Logger::new(cli.log.as_deref())?; let mut pending_eapol: Vec = Vec::new(); - // Periodic stderr progress lines during Phase 1. On by default; `--quiet` + // Periodic stdout progress lines during Phase 1. On by default; `--quiet` // suppresses entirely. The closing stats banner is unaffected. See // `wpawolf::progress`. let mut progress = ProgressReporter::new(!cli.quiet); // Per-frame extraction toggles derived from the CLI output flags. See // `wpawolf::extract::ExtractConfig`. `scan_ies` is independent of `-W`: - // `--wordlist-scan-ies FILE` populates a dedicated `WordlistScanIesStore`, + // `--wordlist-scan FILE` populates a dedicated `WordlistScanIesStore`, // not the curated `-W` wordlist. let extract_cfg = ExtractConfig { populate_wordlist: cli.wordlist_output.is_some(), @@ -523,6 +528,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { // every input file so a directory walk can report a count + format / // endian / DLT distribution rather than only the last file's values. 
stats.input_file_count += 1; + stats.bytes_ingested += file_size; let meta = reader.file_metadata(); // Save before meta fields are moved into stats HashMaps below. let file_fmt = meta.format.clone(); @@ -558,6 +564,10 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { } } // Timestamp range (epoch microseconds). Initialise first_us on the very first packet. + // A zero timestamp is a capture-tool artifact (counted, frame still processed). + if packet.timestamp_us == 0 { + stats.packets_zeroed_timestamp += 1; + } if stats.timestamp_first_us == 0 && packet.timestamp_us > 0 { stats.timestamp_first_us = packet.timestamp_us; } @@ -576,6 +586,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { // Get the DLT for this interface. let Some(dlt) = reader.link_type(packet.interface_id) else { + stats.packets_unknown_linktype += 1; logger.log_unknown_linktype(packet.interface_id); continue; }; @@ -588,7 +599,7 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { 2412..=2484 => stats.band_24ghz += 1, 5180..=5825 => stats.band_5ghz += 1, 5925..=7125 => stats.band_6ghz += 1, - _ => {}, + _ => stats.band_other += 1, } } @@ -634,8 +645,12 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { _ => continue, // unreachable: parse() returns one of the four types } - // Slice the frame body (past MAC header). + // Slice the frame body (past MAC header). The header parsed + // but the captured frame is shorter than its claimed header + // length (snaplen truncation or corrupt length) -- the body, + // and any EAPOL/IE it carried, is unrecoverable. let Some(body) = frame_data.get(mac_hdr.body_offset..) else { + stats.truncated_after_header += 1; continue; }; @@ -740,6 +755,19 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { progress.print_now(stats.total_packets, stats.input_file_count, eapol_total, stats.pmkids_found); } + // Packet-accounting invariant (STATS.md identity 1): every packet in + // `total_packets` reached exactly one terminal disposition in the loop above. 
+ // A future silent `continue` that drops a packet without a counter, or a + // double-count, breaks this. The release banner surfaces it as a BUG row; + // here it hard-fails the test suite (debug builds) before it can ship. + debug_assert_eq!( + stats.packets_accounted(), + stats.total_packets, + "packet accounting broken (STATS.md identity 1): {} accounted vs {} total", + stats.packets_accounted(), + stats.total_packets + ); + // --- Phase 1.5: Resolve deferred WDS EAPOL frames --- // WDS relay frames had ambiguous direction during Phase 1. Now that essid_map is fully // populated, resolve them using essid_map lookup, ACK-based AP discovery, or flag fallback. @@ -761,8 +789,29 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { // ap_count() returns usize; u64 can represent every possible usize value on supported platforms. { stats.essid_count = essid_map.ap_count() as u64; + // "Changes" = distinct SSID variants per AP minus the initial one, maximum across APs. + stats.essid_changes_max = essid_map.max_ssid_variants().saturating_sub(1) as u64; } + // Echo of the resolved output-filter state for the Phase 4 banner, so a WIDE + // run and a --strict run are distinguishable from the banner alone. + stats.filters_active = { + let mut parts: Vec = Vec::new(); + if let Some(s) = cli.eapoltimeout { + parts.push(format!("eapoltimeout={s}")); + } + if let Some(n) = cli.rc_drift { + parts.push(format!("rc-drift={n}")); + } + if cli.dedup_hash_combos { + parts.push("dedup-hash-combos".to_owned()); + } + if cli.nc_dedup { + parts.push(format!("nc-dedup (tolerance {})", cli.nc_tolerance.unwrap_or(8))); + } + if parts.is_empty() { "none (WIDE mode)".to_owned() } else { parts.join(", ") } + }; + // Record output paths in stats so the Phase 4 banner can show configured vs not-configured. 
let path_str = |p: &Option| p.as_ref().map_or_else(String::new, |p| p.display().to_string()); stats.path_22000 = path_str(&cli.out_22000); @@ -780,15 +829,22 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { stats.identity_list_path = path_str(&cli.identity_output); stats.username_list_path = path_str(&cli.username_output); stats.device_info_path = path_str(&cli.device_output); + stats.wordlist_scan_path = path_str(&cli.wordlist_scan); { - // 802.11be MLD canonicalization: if any Multi-Link Element was seen, rewrite all - // MessageStore and PmkidStore keys so link addresses collapse onto the MLD identity. - // When no MLE was observed, this is a no-op and byte-identical to pre-MLE behavior. - // [IEEE 802.11be] §9.4.2.321 + // 802.11be MLD canonicalization: if any Multi-Link Element was seen, ADD an + // MLD-keyed copy of every handshake / PMKID / SSID whose link address maps to + // an MLD, keeping the original link-keyed form. A true multi-link handshake is + // crackable only under the MLD MAC; a single-link association to one BSSID of + // an MLD is crackable only under the link MAC -- so both are emitted (the same + // "emit every candidate" rule as the six N#E# combos). A destructive rewrite + // would silently drop single-link handshakes onto the MLD MAC, producing an + // uncrackable line (corpus-confirmed regression vs hcxpcapngtool). When no MLE + // was observed this is a no-op, byte-identical to pre-MLE behaviour. 
+ // [IEEE 802.11be] §9.4.2.321, §35.3 if !mld_store.is_empty() { - let merged = message_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); - stats.mld_groups_merged = merged; + let copied = message_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); + stats.mld_groups_merged = copied; pmkid_store.canonicalize_pairs(|m| mld_store.canonicalize(m)); stats.essid_link_macs_merged = essid_map.canonicalize_pairs(|m| mld_store.canonicalize(m)); } @@ -851,6 +907,9 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { debug.phase_start(4, "Emit"); } + // Phases 1-3 (the streaming pass + WDS resolution) end here. + stats.wallclock_p13_ms = u64::try_from(run_start.elapsed().as_millis()).unwrap_or(u64::MAX); + message_store.flush_disk_writer(); pmkid_store.flush_disk_writer(); @@ -881,12 +940,21 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { &mut logger, )?; + // The finalize call above flushed every sink, so the Phase 4 wallclock ends + // here: run total minus the Phase 1-3 share measured before emit started. + stats.wallclock_p4_ms = + u64::try_from(run_start.elapsed().as_millis()).unwrap_or(u64::MAX).saturating_sub(stats.wallclock_p13_ms); + // All usize -> u64: u64 subsumes usize on all supported platforms. { - stats.hashes_written = (output_stats.pmkids_written + output_stats.pairs_written) as u64; - stats.dedup_dropped = output_stats.dedup_dropped as u64; - // Total pairs attempted through dedup = written + dropped. 
- stats.eapol_pairs_generated = (output_stats.pairs_written + output_stats.dedup_dropped) as u64; + stats.pmkids_written = output_stats.pmkids_written as u64; + stats.dedup_dropped_pairs = output_stats.dedup_dropped_pairs as u64; + stats.dedup_dropped_pmkids = output_stats.dedup_dropped_pmkids as u64; + stats.dedup_dropped = (output_stats.dedup_dropped_pairs + output_stats.dedup_dropped_pmkids) as u64; + stats.emit_dropped_unclassified_akm = output_stats.emit_dropped_unclassified_akm as u64; + stats.emit_dropped_ft_no_context = output_stats.emit_dropped_ft_no_context as u64; + // Total pairs attempted through dedup = written + pair-side drops. + stats.eapol_pairs_generated = (output_stats.pairs_written + output_stats.dedup_dropped_pairs) as u64; // Per-combo and flag counters from the output pipeline. stats.pairs_written_n1e2 = output_stats.n1e2 as u64; @@ -901,6 +969,8 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { stats.nc_dedup_collapsed_lines = output_stats.nc_dedup_collapsed_lines; stats.nc_dedup_cluster_count = output_stats.nc_dedup_cluster_count; stats.nc_dedup_max_cluster_size = output_stats.nc_dedup_max_cluster_size; + stats.pairs_time_filtered = output_stats.pairs_time_filtered; + stats.pairs_rc_filtered = output_stats.pairs_rc_filtered; stats.rc_gap_max = output_stats.rc_gap_max; stats.rc_drift_enabled = cli.rc_drift.is_some(); stats.eapol_pairs_useful = output_stats.pairs_written as u64; @@ -934,12 +1004,32 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { // line via `HashType::from_akm_and_attack`; copy the resulting tally into // the global stats for `print_summary`. stats.hash_type_emitted = output_stats.hash_type_emitted; + + // Auxiliary sink entry counts (filled by `finalize` from the writer returns). 
+ stats.entries_essid_list = output_stats.entries_essid as u64; + stats.entries_probe_list = output_stats.entries_probe as u64; + stats.entries_wordlist = output_stats.entries_wordlist as u64; + stats.entries_wordlist_scan = output_stats.entries_wordlist_scan as u64; + stats.entries_identity_list = output_stats.entries_identity as u64; + stats.entries_username_list = output_stats.entries_username as u64; + stats.entries_device_info = output_stats.entries_device as u64; + + // Extraction-side identity/username tallies -- printed in Phase 3 even + // when the -I / -U sinks are not configured. + stats.identities_extracted = identity_set.len() as u64; + stats.usernames_extracted = username_set.len() as u64; } logger.flush()?; message_store.cleanup_disk(); pmkid_store.cleanup_disk(); stats.fragment_stats.fragments_incomplete = u64::try_from(fragment_store.len()).unwrap_or(u64::MAX); + + // Phase 5 cost block: peak RSS high-water from the memory monitor and the + // sticky disk-mode flag (monitor or either store may have tripped it). + stats.peak_rss_mib = mem_monitor.peak_rss_bytes() / (1024 * 1024); + stats.disk_mode_engaged = mem_monitor.disk_mode() || message_store.disk_mode() || pmkid_store.disk_mode(); + stats.print_summary(); // Optional `--mem-stats` block: per-store byte-count table for OOM triage. 
@@ -996,7 +1086,7 @@ mod tests { } #[test] - fn strict_alone_enables_all_five_bundled_filters() { + fn strict_alone_enables_all_four_bundled_filters() { let cli = parse_with_strict(&["--strict"]); assert!(cli.strict); assert_eq!(cli.eapoltimeout, Some(5), "--strict -> 5 s session window"); @@ -1030,7 +1120,7 @@ mod tests { let cli = parse_with_strict(&["--strict", "--eapoltimeout=60", "--rc-drift=2"]); assert_eq!(cli.eapoltimeout, Some(60)); assert_eq!(cli.rc_drift, Some(2)); - assert!(cli.dedup_hash_combos, "strict still enables the three boolean filters"); + assert!(cli.dedup_hash_combos, "strict still enables the two boolean filters"); assert!(cli.nc_dedup); } diff --git a/src/mem_monitor.rs b/src/mem_monitor.rs index c73fca8..78826c8 100644 --- a/src/mem_monitor.rs +++ b/src/mem_monitor.rs @@ -29,6 +29,10 @@ pub struct MemMonitor { total_ram: u64, threshold_bytes: u64, last_rss: u64, + /// Highest RSS sample observed across the run (Phase 5 `peak RSS` banner row). + /// Sampled at the pressure-check cadence, so this is a lower bound on the + /// true peak, not an exact high-water mark. + peak_rss: u64, disk_mode: bool, packets_since_check: u64, } @@ -46,7 +50,7 @@ impl MemMonitor { .and_then(|s| s.parse::().ok()) .map_or(THRESHOLD_TENTHS, |pct| pct.min(100) * 10); let threshold_bytes = total_ram / 1000 * tenths; - Self { total_ram, threshold_bytes, last_rss: 0, disk_mode: false, packets_since_check: 0 } + Self { total_ram, threshold_bytes, last_rss: 0, peak_rss: 0, disk_mode: false, packets_since_check: 0 } } /// Probes current RSS and activates disk mode if over threshold. 
@@ -55,11 +59,13 @@ impl MemMonitor { self.packets_since_check = 0; let rss = progress::current_rss_bytes(); self.last_rss = rss; + self.peak_rss = self.peak_rss.max(rss); if !self.disk_mode && rss >= self.threshold_bytes { self.disk_mode = true; let rss_mib = rss / (1024 * 1024); let total_mib = self.total_ram / (1024 * 1024); - eprintln!( + // stdout per FR-CLI-4: stderr produces no output. + println!( "wpawolf: memory pressure ({rss_mib} MiB / {total_mib} MiB, >= 80%) -- switching to disk-backed mode" ); return true; @@ -83,6 +89,7 @@ impl MemMonitor { pub fn would_exceed(&mut self, additional_bytes: u64) -> bool { let rss = progress::current_rss_bytes(); self.last_rss = rss; + self.peak_rss = self.peak_rss.max(rss); rss.saturating_add(additional_bytes) >= self.threshold_bytes } @@ -93,7 +100,8 @@ impl MemMonitor { self.disk_mode = true; let rss_mib = self.last_rss / (1024 * 1024); let total_mib = self.total_ram / (1024 * 1024); - eprintln!( + // stdout per FR-CLI-4: stderr produces no output. + println!( "wpawolf: preemptive disk mode ({rss_mib} MiB / {total_mib} MiB) -- large allocation would exceed 80%" ); } @@ -116,6 +124,12 @@ impl MemMonitor { pub const fn last_rss(&self) -> u64 { self.last_rss } + + /// Highest RSS sample observed so far in bytes (lower bound on the true peak). 
+ #[must_use] + pub const fn peak_rss_bytes(&self) -> u64 { + self.peak_rss + } } impl Default for MemMonitor { @@ -128,6 +142,7 @@ impl std::fmt::Debug for MemMonitor { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MemMonitor") .field("total_ram", &self.total_ram) + .field("peak_rss_mib", &(self.peak_rss / (1024 * 1024))) .field("disk_mode", &self.disk_mode) .field("last_rss_mib", &(self.last_rss / (1024 * 1024))) .field("threshold_mib", &(self.threshold_bytes / (1024 * 1024))) diff --git a/src/output/mod.rs b/src/output/mod.rs index 39dfa15..bfbd195 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -133,8 +133,15 @@ pub struct OutputStats { pub pmkids_written: usize, /// Total *logical* EAPOL pairs that survived dedup at least once across all sinks. pub pairs_written: usize, - /// Total *logical* hashes suppressed by every configured sink's dedup. - pub dedup_dropped: usize, + /// Logical EAPOL pairs suppressed by every configured sink's dedup. + pub dedup_dropped_pairs: usize, + /// Logical PMKID hashes suppressed by every configured sink's dedup. + pub dedup_dropped_pmkids: usize, + /// Hashes dropped at emit because the AKM could not be mapped to one of the + /// 11 types (`HashType::from_akm_and_attack` returned None). + pub emit_dropped_unclassified_akm: usize, + /// FT hashes dropped at emit because the FT context (R0KH-ID) was missing. + pub emit_dropped_ft_no_context: usize, /// Per-sink line counts (passed each sink's dedup, written to disk if configured). pub lines_per_sink: PerSinkCounts, @@ -171,6 +178,12 @@ pub struct OutputStats { /// Largest single NC-dedup cluster observed. Equal to /// `NcDedupStats::max_cluster_size` across `pair_all_groups`. pub nc_dedup_max_cluster_size: u64, + /// Candidate pairs dropped by the `--eapoltimeout` filter, summed across all + /// groups (`NcDedupStats::time_filtered`). Zero in WIDE mode. 
+ pub pairs_time_filtered: u64, + /// Candidate pairs dropped by the `--rc-drift` filter, summed across all + /// groups (`NcDedupStats::rc_filtered`). Zero in WIDE mode. + pub pairs_rc_filtered: u64, /// Maximum `rc_gap_magnitude` seen across all written pairs. pub rc_gap_max: u64, @@ -189,6 +202,22 @@ pub struct OutputStats { /// Distinct AP MACs that triggered at least one `essid_unresolved_emissions`. /// Lower bound on the number of "truly hidden" APs in the capture. pub essid_unresolved_aps: u64, + + // --- auxiliary sink entry counts (filled by `finalize`) --- + /// Entries written to the `-E` ESSID list. + pub entries_essid: usize, + /// Entries written to the `-R` probe-ESSID list. + pub entries_probe: usize, + /// Entries written to the `-W` combined wordlist. + pub entries_wordlist: usize, + /// Entries written to the `--wordlist-scan` IE-scan wordlist. + pub entries_wordlist_scan: usize, + /// Entries written to the `-I` identity list. + pub entries_identity: usize, + /// Entries written to the `-U` username list. + pub entries_username: usize, + /// Entries written to the `-D` device-info table. 
+ pub entries_device: usize, } impl OutputStats { @@ -697,14 +726,19 @@ impl OutputContext { } else { entry.akm }; - let Some(ht) = HashType::from_akm_and_attack(resolved_akm, true) else { continue }; + let Some(ht) = HashType::from_akm_and_attack(resolved_akm, true) else { + stats.emit_dropped_unclassified_akm += 1; + continue; + }; let ssids = essid_map.ssids_for_emit(&entry.ap, essid_filter.collapse_min, essid_filter.collapse_ratio); let is_ft = ht.is_ft(); let ft_ctx: Option<&FtFields> = if is_ft { - match entry.ft.as_ref().filter(|ft| ft.r0khid_len > 0) { - Some(ft) => Some(ft), - None => continue, + if let Some(ft) = entry.ft.as_ref().filter(|ft| ft.r0khid_len > 0) { + Some(ft) + } else { + stats.emit_dropped_ft_no_context += 1; + continue; } } else { None @@ -723,7 +757,7 @@ impl OutputContext { stats.pmkids_written += 1; *stats.hash_type_emitted.entry(ht).or_insert(0) += 1; } else { - stats.dedup_dropped += 1; + stats.dedup_dropped_pmkids += 1; } } } @@ -735,7 +769,7 @@ impl OutputContext { // inside the streaming callback, then dropped -- peak memory is one // group's pairs at a time instead of the full cross-product. 
let pairs_written_before = stats.pairs_written; - let dedup_dropped_before = stats.dedup_dropped; + let dedup_dropped_before = stats.dedup_dropped_pairs; #[allow(clippy::items_after_statements, reason = "EmitState must be defined after Pipeline 1 borrows are used")] struct EmitState<'a> { @@ -767,15 +801,20 @@ impl OutputContext { let EmitState { sinks: s, dedup: d, disk_dedup: dd, stats: st, unresolved_drops: ud, first_error } = &mut *guard; for pair in &pairs { - let Some(ht) = HashType::from_akm_and_attack(pair.akm, false) else { continue }; + let Some(ht) = HashType::from_akm_and_attack(pair.akm, false) else { + st.emit_dropped_unclassified_akm += 1; + continue; + }; let ssids = essid_map.ssids_for_emit(&pair.ap, essid_filter.collapse_min, essid_filter.collapse_ratio); let is_ft = ht.is_ft(); let ft_ctx: Option<&FtFields> = if is_ft { - match pair.ft.as_ref().filter(|ft| ft.r0khid_len > 0) { - Some(ft) => Some(ft), - None => continue, + if let Some(ft) = pair.ft.as_ref().filter(|ft| ft.r0khid_len > 0) { + Some(ft) + } else { + st.emit_dropped_ft_no_context += 1; + continue; } } else { None @@ -813,7 +852,7 @@ impl OutputContext { } st.rc_gap_max = st.rc_gap_max.max(pair.rc_gap_magnitude); } else { - st.dedup_dropped += 1; + st.dedup_dropped_pairs += 1; } }, Err(e) => { @@ -847,13 +886,15 @@ impl OutputContext { es.stats.nc_dedup_collapsed_lines += nc_stats.collapsed_lines; es.stats.nc_dedup_cluster_count += nc_stats.cluster_count; es.stats.nc_dedup_max_cluster_size = es.stats.nc_dedup_max_cluster_size.max(nc_stats.max_cluster_size); + es.stats.pairs_time_filtered += nc_stats.time_filtered; + es.stats.pairs_rc_filtered += nc_stats.rc_filtered; let total_pairs = total_pairs_processed.load(std::sync::atomic::Ordering::Relaxed); debug.phase4_pairs_generated(total_pairs); debug.emit_fan_out_done( total_pairs, es.stats.pairs_written - pairs_written_before, - es.stats.dedup_dropped - dedup_dropped_before, + es.stats.dedup_dropped_pairs - dedup_dropped_before, 
es.stats.nc_dedup_collapsed_lines, es.stats.nc_dedup_cluster_count, ); @@ -917,37 +958,38 @@ impl OutputContext { if let Some(path) = &paths.essid_list { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_essid_list(essid_set, &mut f)?; + self.stats.entries_essid = write_essid_list(essid_set, &mut f)?; f.flush()?; } if let Some(path) = &paths.probe_essid_list { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_probe_essid_list(probe_essid_set, &mut f)?; + self.stats.entries_probe = write_probe_essid_list(probe_essid_set, &mut f)?; f.flush()?; } if let Some(path) = &paths.wordlist { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_wordlist(wordlist_store, &mut f)?; + self.stats.entries_wordlist = write_wordlist(wordlist_store, &mut f)?; f.flush()?; } if let Some(path) = &paths.wordlist_scan { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_wordlist_scan(scan_ies_store, essid_set, probe_essid_set, wordlist_store, &mut f)?; + self.stats.entries_wordlist_scan = + write_wordlist_scan(scan_ies_store, essid_set, probe_essid_set, wordlist_store, &mut f)?; f.flush()?; } if let Some(path) = &paths.identity_list { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_identities(identity_set, &mut f)?; + self.stats.entries_identity = write_identities(identity_set, &mut f)?; f.flush()?; } if let Some(path) = &paths.username_list { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_usernames(username_set, &mut f)?; + self.stats.entries_username = write_usernames(username_set, &mut f)?; f.flush()?; } if let Some(path) = &paths.device_info { let mut f = BufWriter::with_capacity(OUTPUT_BUF_CAPACITY, std::fs::File::create(path)?); - write_device_info(device_store, &mut f)?; + self.stats.entries_device = 
write_device_info(device_store, &mut f)?; f.flush()?; } diff --git a/src/pair/combos.rs b/src/pair/combos.rs index fe80059..1b24002 100644 --- a/src/pair/combos.rs +++ b/src/pair/combos.rs @@ -88,7 +88,12 @@ impl Default for PairConfig { /// at near-zero cost (fingerprint computation is ~20ns vs ~150ns for a full hash line). /// The output phase runs a final ESSID-aware dedup for correctness. #[must_use] -pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &PairConfig) -> Vec { +pub fn generate( + ap: MacAddr, + sta: MacAddr, + messages: &[EapolMessage], + config: &PairConfig, +) -> (Vec, PairFilterStats) { use std::collections::HashSet; // Partition messages by type for O(n*m) pairing rather than O(n^2) over the full list. @@ -159,6 +164,18 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P let mut pairs: Vec = Vec::new(); let mut seen: HashSet = HashSet::new(); + let mut filter_stats = PairFilterStats::default(); + + // Records an opt-in-filter rejection from `try_pair` against the per-group + // tally. A no-op in WIDE mode (no `Err` is ever produced). + macro_rules! count_filtered { + ($reason:expr) => { + match $reason { + FilterReason::Time => filter_stats.time_filtered += 1, + FilterReason::Rc => filter_stats.rc_filtered += 1, + } + }; + } // Inline dedup helper: compute fingerprint and push only if new. // This uses the same fingerprint as output::dedup::eapol_fingerprint but with an @@ -216,8 +233,9 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P // N1E2: ANonce from M1, EAPOL frame from M2. 
[ARCHITECTURE.md §5] for nonce_msg in &m1s { for eapol_msg in &m2s { - if let Some(pair) = try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N1E2, config) { - dedup_push!(pair); + match try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N1E2, config) { + Ok(pair) => dedup_push!(pair), + Err(r) => count_filtered!(r), } } } @@ -225,8 +243,9 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P // N1E4: ANonce from M1, EAPOL frame from M4. Spans the whole session. [ARCHITECTURE.md §5] for nonce_msg in &m1s { for eapol_msg in &m4s { - if let Some(pair) = try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N1E4, config) { - dedup_push!(pair); + match try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N1E4, config) { + Ok(pair) => dedup_push!(pair), + Err(r) => count_filtered!(r), } } } @@ -240,11 +259,14 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P // unit tests below for one representative case per source. for nonce_msg in &m3s { for eapol_msg in &m2s { - if let Some(mut pair) = try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N3E2, config) { - if session_carries_nc || pair.rc_gap_magnitude > 0 { - pair.message_pair |= FLAG_NC; - } - dedup_push!(pair); + match try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N3E2, config) { + Ok(mut pair) => { + if session_carries_nc || pair.rc_gap_magnitude > 0 { + pair.message_pair |= FLAG_NC; + } + dedup_push!(pair); + }, + Err(r) => count_filtered!(r), } } } @@ -252,8 +274,9 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P // N2E3: SNonce from M2, EAPOL frame from M3. AP-less combo. 
[ARCHITECTURE.md §5] for nonce_msg in &m2s { for eapol_msg in &m3s { - if let Some(pair) = try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N2E3, config) { - dedup_push!(pair); + match try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N2E3, config) { + Ok(pair) => dedup_push!(pair), + Err(r) => count_filtered!(r), } } } @@ -261,8 +284,9 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P // N4E3: SNonce from M4, EAPOL frame from M3. AP-less combo. [ARCHITECTURE.md §5] for nonce_msg in &m4s { for eapol_msg in &m3s { - if let Some(pair) = try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N4E3, config) { - dedup_push!(pair); + match try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N4E3, config) { + Ok(pair) => dedup_push!(pair), + Err(r) => count_filtered!(r), } } } @@ -276,16 +300,19 @@ pub fn generate(ap: MacAddr, sta: MacAddr, messages: &[EapolMessage], config: &P // on standard handshakes where M3.rc = M4.rc exactly. for nonce_msg in &m3s { for eapol_msg in &m4s { - if let Some(mut pair) = try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N3E4, config) { - if session_carries_nc || pair.rc_gap_magnitude > 0 { - pair.message_pair |= FLAG_NC; - } - dedup_push!(pair); + match try_pair(ap, sta, nonce_msg, eapol_msg, ComboType::N3E4, config) { + Ok(mut pair) => { + if session_carries_nc || pair.rc_gap_magnitude > 0 { + pair.message_pair |= FLAG_NC; + } + dedup_push!(pair); + }, + Err(r) => count_filtered!(r), } } } - pairs + (pairs, filter_stats) } // --- try_pair --- @@ -303,11 +330,11 @@ fn try_pair( eapol_msg: &EapolMessage, combo: ComboType, config: &PairConfig, -) -> Option { +) -> Result { // Time constraint: both messages must fall within the configured EAPOL session window. 
// [ARCHITECTURE.md §8 FR-PAIR-3] if config.time_check_enabled && !within_time(nonce_msg.timestamp, eapol_msg.timestamp, config.eapol_timeout_us) { - return None; + return Err(FilterReason::Time); } // RC constraint (opt-in via --rc-drift): replay counters must be consistent with the @@ -315,7 +342,10 @@ fn try_pair( // pairs with the standard M3.rc = M2.rc + 1 delta are not spuriously rejected. // Unfiltered (rc_drift_enabled=false): all pairs treated as RC-exact. [ARCHITECTURE.md §8 FR-PAIR-4] let rc_rel = if config.rc_drift_enabled { - within_rc_for_combo(nonce_msg, eapol_msg, combo, config.rc_drift_tolerance)? + match within_rc_for_combo(nonce_msg, eapol_msg, combo, config.rc_drift_tolerance) { + Some(rel) => rel, + None => return Err(FilterReason::Rc), + } } else { RcRelation::Exact // unfiltered: no RC constraint, treat all pairs as exact }; @@ -360,7 +390,7 @@ fn try_pair( // Saturate on the impossible overflow to keep clippy happy. let rc_gap_magnitude = u64::try_from((nonce_rc - eapol_rc - expected_delta).unsigned_abs()).unwrap_or(u64::MAX); - Some(PairedHash { + Ok(PairedHash { ap, sta, combo_type: combo, @@ -374,6 +404,29 @@ fn try_pair( }) } +/// Why a candidate (nonce, EAPOL) pair was rejected by an opt-in output filter. +/// +/// Both variants only ever occur when the corresponding filter flag is set +/// (`--eapoltimeout` / `--rc-drift`); in WIDE mode `try_pair` never returns `Err`. +/// `generate` tallies these so the banner can show how many pairs a filter removed +/// rather than letting them vanish. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum FilterReason { + /// Dropped by the `--eapoltimeout` session-window constraint (FR-PAIR-3). + Time, + /// Dropped by the `--rc-drift` replay-counter constraint (FR-PAIR-4). + Rc, +} + +/// Per-group tally of pairs removed by the opt-in output filters. Zero in WIDE mode. 
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct PairFilterStats { + /// Candidate pairs dropped by the `--eapoltimeout` filter. + pub time_filtered: u64, + /// Candidate pairs dropped by the `--rc-drift` filter. + pub rc_filtered: u64, +} + // --- Nonce endianness detection --- /// Inspects M1 and M3 `ANonce` bytes to decide whether the AP is storing the nonce's @@ -592,7 +645,7 @@ mod tests { fn generate_n1e2_basic() { // M1 (RC=1, ts=0) paired with M2 (RC=1, ts=100) -> one N1E2 pair. let msgs = vec![make_msg(MsgType::M1, 1, 0, 0xAA), make_msg(MsgType::M2, 1, 100, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.len(), 1); assert_eq!(pairs[0].combo_type, ComboType::N1E2); } @@ -608,8 +661,12 @@ mod tests { ..PairConfig::default() }; let msgs = vec![make_msg(MsgType::M1, 1, 0, 0xAA), make_msg(MsgType::M2, 1, 6_000_000, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &config); + let (pairs, fs) = generate(ap(), sta(), &msgs, &config); assert!(pairs.is_empty()); + // The one N1E2 candidate was removed by the time filter and is counted, + // not vanished. RC filter is off, so its tally stays zero. + assert_eq!(fs.time_filtered, 1, "the time-filtered candidate must be tallied"); + assert_eq!(fs.rc_filtered, 0); } #[test] @@ -617,8 +674,11 @@ mod tests { // M1 RC=1, M2 RC=100 -> delta=99 > tolerance=8 -> no pairs when rc_drift is on. let config = PairConfig { rc_drift_enabled: true, rc_drift_tolerance: 8, ..PairConfig::default() }; let msgs = vec![make_msg(MsgType::M1, 1, 0, 0xAA), make_msg(MsgType::M2, 100, 100, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &config); + let (pairs, fs) = generate(ap(), sta(), &msgs, &config); assert!(pairs.is_empty()); + // The N1E2 candidate was removed by the RC filter and is counted. 
+ assert_eq!(fs.rc_filtered, 1, "the RC-filtered candidate must be tallied"); + assert_eq!(fs.time_filtered, 0); } #[test] @@ -626,7 +686,7 @@ mod tests { // M3 (RC=2, ts=200) paired with M2 (RC=2, ts=100) -> at least one N3E2 pair. // N2E3 also fires here (M2 as nonce, M3 as eapol, same RCs), so assert by filtering. let msgs = vec![make_msg(MsgType::M3, 2, 200, 0xAA), make_msg(MsgType::M2, 2, 100, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.iter().filter(|p| p.combo_type == ComboType::N3E2).count(), 1); } @@ -644,7 +704,7 @@ mod tests { make_msg(MsgType::M2, 1, 100, 0xB1), make_msg(MsgType::M3, 2, 200, 0xC1), ]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); let n3e2: Vec<&PairedHash> = pairs.iter().filter(|p| p.combo_type == ComboType::N3E2).collect(); assert_eq!(n3e2.len(), 1, "expected one N3E2 pair"); assert_ne!( @@ -662,7 +722,7 @@ mod tests { // be set even without M1 inheritance. Mirrors the mid-capture-start case // where the AP retransmitted M3 against an earlier M2. let msgs = vec![make_msg(MsgType::M2, 1, 100, 0xB1), make_msg(MsgType::M3, 1, 200, 0xC1)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); let n3e2: Vec<&PairedHash> = pairs.iter().filter(|p| p.combo_type == ComboType::N3E2).collect(); assert_eq!(n3e2.len(), 1, "expected one N3E2 pair"); assert_ne!( @@ -680,7 +740,7 @@ mod tests { // the line-by-line superset invariant against hcx-default. This test // pins the matching behaviour. 
let msgs = vec![make_msg(MsgType::M2, 1, 100, 0xB1), make_msg(MsgType::M3, 2, 200, 0xC1)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); let n3e2: Vec<&PairedHash> = pairs.iter().filter(|p| p.combo_type == ComboType::N3E2).collect(); assert_eq!(n3e2.len(), 1, "expected one N3E2 pair"); assert_eq!( @@ -709,7 +769,7 @@ mod tests { let m3_a = EapolMessage { nonce: n_a, ..make_msg(MsgType::M3, 2, 200, 0xC1) }; let m3_b = EapolMessage { nonce: n_b, ..make_msg(MsgType::M3, 2, 220, 0xC2) }; let msgs = vec![make_msg(MsgType::M2, 1, 100, 0xB1), m3_a, m3_b]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); let n3e2: Vec<&PairedHash> = pairs.iter().filter(|p| p.combo_type == ComboType::N3E2).collect(); assert!(!n3e2.is_empty(), "expected at least one N3E2 pair"); for p in &n3e2 { @@ -727,7 +787,7 @@ mod tests { // M2 (RC=1, ts=100) paired with M3 (RC=2, ts=200) -> at least one N2E3 pair. // N3E2 also fires (M3 as nonce RC=2, M2 as eapol RC=1, delta=1 -> Exact), so filter. let msgs = vec![make_msg(MsgType::M2, 1, 100, 0xCC), make_msg(MsgType::M3, 2, 200, 0xDD)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.iter().filter(|p| p.combo_type == ComboType::N2E3).count(), 1); } @@ -736,7 +796,7 @@ mod tests { // M1 (RC=1, ts=0) paired with M4 (RC=2, ts=300) -> one N1E4 pair (RC diff=1 <= 8). // No other combos fire with only M1 and M4 in the list. 
let msgs = vec![make_msg(MsgType::M1, 1, 0, 0xAA), make_msg(MsgType::M4, 2, 300, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.len(), 1); assert_eq!(pairs[0].combo_type, ComboType::N1E4); } @@ -746,7 +806,7 @@ mod tests { // M4 (RC=2, ts=300) paired with M3 (RC=2, ts=200) -> at least one N4E3 pair. // N3E4 also fires (same M3 as nonce, M4 as eapol, same RCs), so filter. let msgs = vec![make_msg(MsgType::M4, 2, 300, 0xAA), make_msg(MsgType::M3, 2, 200, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.iter().filter(|p| p.combo_type == ComboType::N4E3).count(), 1); } @@ -755,7 +815,7 @@ mod tests { // M3 (RC=2, ts=200) paired with M4 (RC=2, ts=300) -> at least one N3E4 pair. // N4E3 also fires (M4 as nonce, M3 as eapol, same RCs), so filter. let msgs = vec![make_msg(MsgType::M3, 2, 200, 0xAA), make_msg(MsgType::M4, 2, 300, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.iter().filter(|p| p.combo_type == ComboType::N3E4).count(), 1); } @@ -768,7 +828,7 @@ mod tests { make_msg(MsgType::M2, 1, 50, 0xB1), make_msg(MsgType::M2, 1, 60, 0xB2), ]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); // All four should be N1E2. assert_eq!(pairs.len(), 4); assert!(pairs.iter().all(|p| p.combo_type == ComboType::N1E2)); @@ -777,7 +837,7 @@ mod tests { #[test] fn generate_empty_messages() { // Empty slice -> no pairs. 
- let pairs = generate(ap(), sta(), &[], &default_config()); + let (pairs, _) = generate(ap(), sta(), &[], &default_config()); assert!(pairs.is_empty()); } @@ -798,7 +858,7 @@ mod tests { nc_dedup_enabled: false, nc_tolerance: 8, }; - let pairs = generate(ap(), sta(), &msgs, &tight); + let (pairs, _) = generate(ap(), sta(), &msgs, &tight); // With rc_drift active and tolerance=8, the pair should be found with NC set. assert_eq!(pairs.len(), 1); assert_ne!(pairs[0].message_pair & FLAG_NC, 0, "FLAG_NC must be set for within-tolerance RC"); @@ -808,7 +868,7 @@ mod tests { fn generate_combo_type_in_message_pair() { // N1E2 -> combo discriminant = 0, so message_pair & 0x07 == 0. let msgs = vec![make_msg(MsgType::M1, 1, 0, 0xAA), make_msg(MsgType::M2, 1, 100, 0xBB)]; - let pairs = generate(ap(), sta(), &msgs, &default_config()); + let (pairs, _) = generate(ap(), sta(), &msgs, &default_config()); assert_eq!(pairs.len(), 1); assert_eq!(pairs[0].message_pair & 0x07, ComboType::N1E2 as u8); } diff --git a/src/pair/mod.rs b/src/pair/mod.rs index d51b963..6533e38 100644 --- a/src/pair/mod.rs +++ b/src/pair/mod.rs @@ -108,11 +108,14 @@ use crate::pair::nc_dedup::{NcDedupStats, nc_dedup}; use crate::store::messages::{EapolMessage, MessageStore}; use crate::types::{MacPair, MsgType}; -/// Folds `other` into `acc` in place: `collapsed_lines` and `cluster_count` -/// sum component-wise; `max_cluster_size` takes the larger of the two. +/// Folds `other` into `acc` in place: `collapsed_lines`, `cluster_count`, and +/// the opt-in filter drops sum component-wise; `max_cluster_size` takes the +/// larger of the two. 
const fn merge_nc_stats(acc: &mut NcDedupStats, other: NcDedupStats) { acc.collapsed_lines += other.collapsed_lines; acc.cluster_count += other.cluster_count; + acc.time_filtered += other.time_filtered; + acc.rc_filtered += other.rc_filtered; if other.max_cluster_size > acc.max_cluster_size { acc.max_cluster_size = other.max_cluster_size; } @@ -148,9 +151,14 @@ fn pair_one_group( ) -> (Vec, NcDedupStats) { let mut sorted = messages.to_vec(); sorted.sort_unstable_by_key(|m| m.timestamp); - let pairs = generate(mac_pair.ap, mac_pair.sta, &sorted, config); + let (pairs, filter_stats) = generate(mac_pair.ap, mac_pair.sta, &sorted, config); let pairs = collapse(pairs, config.all_combos); - nc_dedup(pairs, config) + let (pairs, mut nc) = nc_dedup(pairs, config); + // Carry the generate-time filter drops on the same per-group struct that the + // streaming and disk merge paths already aggregate. + nc.time_filtered = filter_stats.time_filtered; + nc.rc_filtered = filter_stats.rc_filtered; + (pairs, nc) } /// Streaming pairing pipeline: pairs each group and delivers results via callback. diff --git a/src/pair/nc_dedup.rs b/src/pair/nc_dedup.rs index 8ccd90d..5e1d4d6 100644 --- a/src/pair/nc_dedup.rs +++ b/src/pair/nc_dedup.rs @@ -59,6 +59,13 @@ pub struct NcDedupStats { pub cluster_count: u64, /// Largest cluster size observed in this call. pub max_cluster_size: u64, + /// Candidate pairs dropped by the `--eapoltimeout` filter in `generate`. + /// Carried on this per-group stats struct so the filter drops ride the same + /// streaming and disk merge paths as the NC-dedup counts. Zero in WIDE mode. + pub time_filtered: u64, + /// Candidate pairs dropped by the `--rc-drift` filter in `generate`. Zero in + /// WIDE mode. 
+ pub rc_filtered: u64, } /// Collapses near-identical-nonce siblings within `pairs`, tagging the survivor diff --git a/src/stats.rs b/src/stats.rs index 2631ea9..4d0adf5 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -67,6 +67,11 @@ pub struct Stats { /// reach the inner LLC/SNAP. Each increment corresponds to one MSDU recovered /// for downstream EAPOL/EAP processing. [IEEE 802.11-2024] §9.2.4.8.3 pub mesh_control_frames: u64, + /// Mesh Data frames dropped because the Mesh Control header could not be + /// skipped: the Address Extension Mode field was the reserved value `11`, or + /// the body was shorter than the header it claimed. [IEEE 802.11-2024] + /// §9.2.4.8.3. The inner MSDU (and any EAPOL it carried) is lost. + pub mesh_control_malformed: u64, /// EAP-Success frames (Code 3) seen in EAPOL EAP-Packet payloads. RFC 3748 §4.2. /// Stats-only; carries no identity data so it never affects hash extraction. A /// non-zero count alongside zero EAP-Failure indicates a successful enterprise @@ -137,12 +142,42 @@ pub struct Stats { /// inversions per file so a deeply-shuffled capture does not flood the /// log. pub out_of_sequence_timestamps: u64, - /// Hash lines written to output file(s). - pub hashes_written: u64, - /// Hash lines dropped by the dedup filter. + /// Packets whose capture timestamp was zero (capture-tool artifact; the + /// frame is still processed). Counterpart of `out_of_sequence_timestamps`. + pub packets_zeroed_timestamp: u64, + /// Packets dropped because their pcapng `interface_id` had no IDB-registered + /// DLT (a missing-interface reference); also logged via `[unknown_linktype]`. + /// [draft-ietf-opsawg-pcapng-05 §4.2] + pub packets_unknown_linktype: u64, + /// Frames dropped after a successful MAC-header parse because the header's + /// `body_offset` ran past the captured frame length (snaplen-truncated or + /// corrupt length). The header parsed but the body slice was unavailable. 
+ pub truncated_after_header: u64, + /// Total bytes of capture files opened by the ingest loop (file sizes, not + /// decompressed payload). Feeds the Phase 5 throughput row. + pub bytes_ingested: u64, + /// Logical PMKID hashes written (post-dedup), from `OutputStats::pmkids_written`. + pub pmkids_written: u64, + /// Hash lines dropped by the dedup filter (pairs + PMKIDs). pub dedup_dropped: u64, + /// EAPOL-pair share of `dedup_dropped`. + pub dedup_dropped_pairs: u64, + /// PMKID share of `dedup_dropped`. + pub dedup_dropped_pmkids: u64, + /// Hashes dropped at emit because the AKM could not be mapped to one of the + /// 11 types (`HashType::from_akm_and_attack` returned None even after AKM-map + /// inference for PMKIDs). Crack material we extracted but cannot format. + pub emit_dropped_unclassified_akm: u64, + /// FT hashes (types 6/7/10/11) dropped at emit because the FT context was + /// incomplete (no R0KH-ID), so the `WPA*03*`/`WPA*04*` FT line could not be + /// built. [ARCHITECTURE.md §7 FT line format] + pub emit_dropped_ft_no_context: u64, /// Unique AP ESSIDs seen. pub essid_count: u64, + /// Largest number of SSID changes recorded for any single AP (distinct + /// SSID variants minus the initial one). Capture-quality signal: a high + /// value on one AP usually means RF-rotted duplicate beacons. + pub essid_changes_max: u64, // --- Per-subtype management frame counters --- // [IEEE 802.11-2024] §9.2.4.1.3, Table 9-1 @@ -150,6 +185,12 @@ pub struct Stats { pub beacon_frames: u64, /// Probe Response frames (subtype 5). pub probe_resp_frames: u64, + /// Probe Responses whose SSID IE was zero-length (unset). A Probe Response + /// answering a directed probe should carry the SSID; an empty one is a + /// capture-quality signal. Mirrors hcxpcapngtool's `PROBERESPONSE (SSID unset)`. + pub probe_resp_ssid_unset: u64, + /// Probe Responses whose SSID IE bytes were all `0x00` (zeroed). 
+ pub probe_resp_ssid_zeroed: u64, /// Probe Request frames -- directed (unicast DA, subtype 4). pub probe_req_directed: u64, /// Probe Request frames -- undirected (broadcast DA, subtype 4). @@ -166,12 +207,20 @@ pub struct Stats { pub auth_frames: u64, /// Deauthentication frames (subtype 12). pub deauth_frames: u64, + /// Deauthentication frames carrying Reason Code 14 ("Message integrity + /// code (MIC) failure"). [IEEE 802.11-2024] §9.4.1.7 Table 9-90. The + /// canonical "this handshake will never pair cleanly" signal. + pub mic_failure_deauths: u64, /// Disassociation frames (subtype 10). pub disassoc_frames: u64, /// Action frames (subtype 13). pub action_frames: u64, /// Action No Ack frames (subtype 14). pub action_no_ack_frames: u64, + /// Management frames whose subtype is reserved (7, 15) per Table 9-1 -- counted + /// in `mgmt_frames` but not in any named subtype counter. Diagnostic; lets the + /// management subtype children reconcile against the management total. + pub mgmt_reserved_subtype: u64, /// ATIM frames (subtype 9). pub atim_frames: u64, /// Measurement Pilot frames (subtype 6). @@ -258,6 +307,11 @@ pub struct Stats { pub band_5ghz: u64, /// Packets with channel frequency in the 6 GHz band (5925-7125 MHz). pub band_6ghz: u64, + /// Packets carrying a radiotap Channel frequency that fell outside the three + /// known Wi-Fi bands above (sub-GHz 802.11ah, 60 GHz DMG, or a corrupt field). + /// Informational only -- the frame is processed normally; this just makes the + /// per-band split account for every channel-bearing packet. + pub band_other: u64, // --- Beacon channel distribution (from DS Parameter Set IE, tag 3) --- // Populated only for Beacon frames. Key = channel number (1 byte from the IE). @@ -344,6 +398,16 @@ pub struct Stats { /// N3E4 pairs written (`ANonce` from M3, EAPOL from M4 -- authorized). 
pub pairs_written_n3e4: u64, + // --- Pairing output-filter drops (opt-in; zero in WIDE mode) --- + /// Candidate pairs discarded by the `--eapoltimeout` session-window filter + /// (the two messages fell more than the window apart). Off by default; only + /// nonzero when `--eapoltimeout` / `--strict` is set. [ARCHITECTURE.md §8 FR-PAIR-3] + pub pairs_time_filtered: u64, + /// Candidate pairs discarded by the `--rc-drift` replay-counter filter (the + /// per-combo RC relationship exceeded the tolerance). Off by default; only + /// nonzero when `--rc-drift` / `--strict` is set. [ARCHITECTURE.md §8 FR-PAIR-4] + pub pairs_rc_filtered: u64, + // --- RC / NC / endianness stats --- /// Maximum actual RC gap magnitude seen across useful pairs written to output. pub rc_gap_max: u64, @@ -398,6 +462,16 @@ pub struct Stats { /// dispatch through `process_auth_pasn` and increment this single /// counter. pub auth_pasn: u64, + /// Authentication responses with Status Code 52 ("R0KH unreachable"). + /// [IEEE 802.11-2024] §9.4.1.9 Table 9-92. Each one explains a missing + /// FT-PSK handshake: the AP refused the FT authentication. + pub ft_status_r0kh_unreachable: u64, + /// Authentication responses with Status Code 53 ("Invalid PMKID"). Table 9-92. + pub ft_status_invalid_pmkid: u64, + /// Authentication responses with Status Code 54 ("Invalid MDE"). Table 9-92. + pub ft_status_invalid_mde: u64, + /// Authentication responses with Status Code 55 ("Invalid FTE"). Table 9-92. + pub ft_status_invalid_fte: u64, // EAPOL descriptor type breakdown. /// EAPOL-Key frames with RSN descriptor type (0x02). [IEEE 802.11-2024] §12.7.2 @@ -411,7 +485,11 @@ pub struct Stats { pub eapol_kdv2: u64, /// EAPOL-Key frames with Key Descriptor Version 3 (AES-128-CMAC; PSK-SHA256 / FT-PSK). pub eapol_kdv3: u64, - /// EAPOL-Key frames with Key Descriptor Version 0 or 4-7 (reserved / non-standard). + /// EAPOL-Key frames with Key Descriptor Version 0 ("AKM-defined"). 
Spec-legitimate + /// for the SHA-384 AKM families (19/20) and other post-KDV suites, NOT an anomaly. + /// [IEEE 802.11-2024] §12.7.2 Table 12-11. + pub eapol_kdv0: u64, + /// EAPOL-Key frames with Key Descriptor Version 4-7 (reserved / non-standard). pub eapol_kdv_other: u64, /// EAPOL frames rejected because the Key Nonce was all-NULL (`0x00...00`). Applies to /// every message type including M4. M4 NULL nonce is spec-valid on the wire per @@ -484,9 +562,13 @@ pub struct Stats { /// Diagnostic only -- output correctness is unaffected. [ARCHITECTURE.md §4] pub anonce_m1_m3_mismatch_sessions: u64, // WPA/WEP encrypted data frame counts. - /// Data frames with the Protected Frame bit set (WPA/WEP encrypted payload). - /// [IEEE 802.11-2024] §9.2.4.1.1 bit B14 + /// Data frames with the Protected Frame bit set whose `KeyID` octet carries + /// ExtIV=1 (TKIP/CCMP/GCMP -- the WPA family), or whose body is too short + /// to expose the `KeyID` octet. [IEEE 802.11-2024] §9.2.4.1.1 bit B14, §12.5.2.2. pub wpa_encrypted_data: u64, + /// Data frames with the Protected Frame bit set and ExtIV=0 in the `KeyID` + /// octet (legacy WEP encapsulation). [IEEE 802.11-2024] §12.3.4.2. + pub wep_encrypted_data: u64, /// Management frames with the Protected Frame bit set (PMF / 802.11w). /// [IEEE 802.11-2024] §11.13. Covers Disassoc, Deauth, and Robust Action frames @@ -586,12 +668,13 @@ pub struct Stats { pub mle_basic_seen: u64, /// Distinct link -> MLD MAC mappings learned from MLE bodies. pub mle_mld_addrs_learned: u64, - /// `(AP, STA)` groups merged during MLD canonicalization in `message_store`. + /// `(AP, STA)` groups that received an additional MLD-keyed copy during MLD + /// canonicalization in `message_store` (the original link-keyed group is kept). pub mld_groups_merged: u64, - /// Link-MAC SSID entries folded into the MLD canonical key during - /// `essid_map` canonicalization. 
Each merged entry was an AP advertising - /// under a band link MAC whose SSID would otherwise have been unreachable - /// via the MLD-keyed pair lookup at output time. + /// Link-MAC SSID entries that received an MLD-keyed copy during `essid_map` + /// canonicalization (the original link entry is kept). The copy lets a + /// handshake emitted under the MLD key resolve its SSID; the original lets a + /// single-link handshake emitted under the link key resolve its SSID. pub essid_link_macs_merged: u64, /// Hash lines suppressed because no Beacon, Probe Response, `AssocReq` / /// `ReassocReq`, directed Probe Request, nor MLD link-MAC fallback yielded @@ -678,7 +761,7 @@ pub struct Stats { // and were written to the configured file. `dropped_` is the count of lines // suppressed by that sink's dedup. A single logical hash fans out to up to three // sinks (legacy + per-AKM-family + combined), so the per-sink counters do not sum - // to `hashes_written`. See `ARCHITECTURE.md §7`. + // to the Phase 5 logical hash total. See `ARCHITECTURE.md §7`. /// `--22000-out` lines written. pub lines_22000: u64, /// `--37100-out` lines written. @@ -748,6 +831,8 @@ pub struct Stats { pub username_list_path: String, /// Path for device info output, or empty when -D was not given. pub device_info_path: String, + /// Path for the `--wordlist-scan` IE-scan output, or empty when not given. + pub wordlist_scan_path: String, // --- Per-hash-type breakdown (the 11-row per-AKM) --- // Counts the number of hash lines emitted for each row of the table in @@ -758,6 +843,44 @@ pub struct Stats { /// Hash lines written, keyed by `HashType`. pub hash_type_emitted: HashMap, + // --- Extraction-side identity tallies --- + /// Unique EAP identity strings extracted (RFC 3748 §5.1). Printed in Phase 3 + /// even when the `-I` sink is not configured. + pub identities_extracted: u64, + /// Unique EAP peer-identity (username) strings extracted. 
Printed in Phase 3 + /// even when the `-U` sink is not configured. + pub usernames_extracted: u64, + + // --- Auxiliary sink entry counts (lines actually written by `finalize`) --- + /// Entries written to the `-E` ESSID list. + pub entries_essid_list: u64, + /// Entries written to the `-R` probe-ESSID list. + pub entries_probe_list: u64, + /// Entries written to the `-W` combined wordlist. + pub entries_wordlist: u64, + /// Entries written to the `--wordlist-scan` IE-scan wordlist. + pub entries_wordlist_scan: u64, + /// Entries written to the `-I` identity list. + pub entries_identity_list: u64, + /// Entries written to the `-U` username list. + pub entries_username_list: u64, + /// Entries written to the `-D` device-info table. + pub entries_device_info: u64, + + // --- Phase 4 run context + Phase 5 cost block --- + /// Echo of the resolved output-filter state ("none (WIDE mode)" or the + /// active flag list), so a WIDE run and a --strict run are distinguishable + /// from the banner alone. + pub filters_active: String, + /// Wallclock of the Phase 1-3 streaming pass in milliseconds. + pub wallclock_p13_ms: u64, + /// Wallclock of the Phase 4 pairing + emit pass in milliseconds. + pub wallclock_p4_ms: u64, + /// Peak RSS sample in MiB (lower bound; sampled at the `MemMonitor` cadence). + pub peak_rss_mib: u64, + /// True when the disk-backed fallback engaged at any point during the run. + pub disk_mode_engaged: bool, + // --- Scratch / derived state (not printed directly) --- /// Per-(AP,STA) timestamp of the most recently stored EAPOL message. /// Accumulated during Phase 1 to compute `eapol_time_gap_max_us`. Not printed. @@ -844,11 +967,14 @@ impl Stats { /// Records an EAPOL-Key frame's Key Descriptor Version into the appropriate counter. /// /// Called once per stored EAPOL-Key message. 
KDV 1 = HMAC-MD5 (WPA legacy), - /// KDV 2 = HMAC-SHA1 (WPA2-PSK), KDV 3 = AES-CMAC (PSK-SHA256 / FT-PSK); - /// all other values are reserved and counted under `eapol_kdv_other`. - /// [IEEE 802.11-2024] §12.7.2, Key Information bits 0-2. + /// KDV 2 = HMAC-SHA1 (WPA2-PSK), KDV 3 = AES-CMAC (PSK-SHA256 / FT-PSK), + /// KDV 0 = "AKM-defined" (spec-legitimate for the SHA-384 AKM families, + /// counted separately so the banner does not flag it as an anomaly); + /// 4-7 are reserved and counted under `eapol_kdv_other`. + /// [IEEE 802.11-2024] §12.7.2, Key Information bits 0-2, Table 12-11. pub const fn record_key_descriptor_version(&mut self, key_version: u8) { match key_version { + 0 => self.eapol_kdv0 += 1, 1 => self.eapol_kdv1 += 1, 2 => self.eapol_kdv2 += 1, 3 => self.eapol_kdv3 += 1, @@ -906,33 +1032,72 @@ impl Stats { entries.iter().map(|(k, n)| format!("{k} ({n})")).collect::>().join(", ") } - /// Prints the four-section summary to stderr. + /// Sum of the eight terminal per-packet dispositions from Phase 1 + 2. /// - /// Sections: - /// 1. General -- capture file metadata and all 802.11 frame-type counts. - /// 2. EAPOL -- message counts, classification, pair analysis. - /// 3. PMKID -- per-source and per-AKM breakdown of all extracted PMKIDs. - /// 4. Output -- what was available per hashcat mode and what was written. + /// Every packet counted in `total_packets` follows the main loop to exactly + /// one of these outcomes: dropped for an unknown link type, dropped when the + /// link strip failed all recovery tiers, counted as a control frame, dropped + /// as a malformed MAC header, dropped when truncated past the header, counted + /// as an extension frame, or handed to extraction as a management or data + /// frame. This therefore must equal `total_packets` -- STATS.md reconciliation + /// identity 1. 
The banner surfaces any discrepancy as a "packets unaccounted" + /// / "frames multi-counted" BUG row, and `run()` debug-asserts the equality so + /// a future silent `continue` cannot pass the test suite. + #[must_use] + pub const fn packets_accounted(&self) -> u64 { + self.packets_unknown_linktype + + self.link_errors + + self.ctrl_frames + + self.malformed_mac_hdr + + self.truncated_after_header + + self.extension_frames + + self.mgmt_frames + + self.data_frames + } + + /// Prints the five-section closing banner to stdout. /// - /// Called unconditionally at the end of every run. + /// Called unconditionally at the end of every run. The banner layout is the + /// contract documented in `ARCHITECTURE.md §9`: one section per pipeline + /// phase, dotted-leader rows at a fixed value column, `nz!` suppression for + /// quiet runs, and a drop/recovered/diagnostic/informational suffix on every + /// issue row. pub fn print_summary(&self) { + print!("{}", self.summary_string()); + } + + /// Renders the closing banner as a string. + /// + /// Separated from [`Self::print_summary`] so tests can assert on the rendered + /// rows (label width, row presence, parent/child sums) without capturing stdout. + #[must_use] + #[allow(clippy::too_many_lines, reason = "linear banner layout in pipeline order; splitting hurts auditability")] + pub fn summary_string(&self) -> String { // W: dot-padding width for the label column. Longest label is 45 chars; - // W must exceed that so every row gets at least several dots before ": ". - const W: usize = 52; + // W must exceed every label so each row gets at least two dots before ": ". + const W: usize = 60; + // Hard cap on label width (W minus the two-dot minimum). Enforced by the + // debug_assert in the row macros and the `banner_labels_fit_column` test. + const LABEL_MAX: usize = W - 2; // Section header total width (header text + fill dashes). 
- const SW: usize = 62; + const SW: usize = 70; // EAPOL header overhead shown in auth-length display (matches hcxpcapngtool). const EAPAUTH_SIZE: u16 = 4; + use std::fmt::Write as _; + let mut out = String::with_capacity(8 * 1024); + macro_rules! stat { - ($label:expr, $val:expr) => { - println!("{:. {{ + let label = $label; + debug_assert!(label.chars().count() <= LABEL_MAX, "banner label exceeds {LABEL_MAX} chars: {label}"); + let _ = writeln!(out, "{:. { if $val > 0 { - println!("{:. {{ let hdr = format!("=== Phase {} -- {} ", $num, $name); let fill = "=".repeat(SW.saturating_sub(hdr.len()).max(4)); - println!("{hdr}{fill}"); + let _ = writeln!(out, "{hdr}{fill}"); }}; } - println!("---"); - println!("wpawolf {} ({})", env!("CARGO_PKG_VERSION"), env!("GIT_HASH")); - println!("---"); + let _ = writeln!(out, "---"); + let _ = writeln!(out, "wpawolf {} ({})", env!("CARGO_PKG_VERSION"), env!("GIT_HASH")); + let _ = writeln!(out, "---"); // ====================================================================== // Phase 1 -- Ingest: file metadata + raw packet/byte ingestion @@ -992,14 +1157,18 @@ impl Stats { let dur_s = self.timestamp_last_us.saturating_sub(self.timestamp_first_us) / 1_000_000; stat!("duration (s)", dur_s); } + nz!("bytes ingested (MiB)", self.bytes_ingested / (1024 * 1024)); stat!("packets total", self.total_packets); nz!("link/parse errors (frames dropped)", self.link_errors); nz!(" MAC header malformed (frame dropped)", self.malformed_mac_hdr); - nz!("frames with non-zero Protocol Version (forgiven; processed)", self.lenient_proto_version); - nz!("capture files with truncated trailing record (earlier records kept)", self.truncated_capture_files); + nz!("non-zero Protocol Version (forgiven; processed)", self.lenient_proto_version); + nz!("files with truncated trailing record (earlier kept)", self.truncated_capture_files); nz!(" trailing packets unread (dropped; see --log)", self.unreadable_packets); nz!("input files skipped (magic unrecognised; 
see --log)", self.files_skipped_unknown_format); - nz!("pcap timestamps out of sequence (capture-tool artifact; informational)", self.out_of_sequence_timestamps); + nz!("packets dropped (unknown link type; no IDB)", self.packets_unknown_linktype); + nz!("packets dropped (truncated past MAC header)", self.truncated_after_header); + nz!("packets with zeroed timestamps (informational)", self.packets_zeroed_timestamp); + nz!("timestamps out of sequence (informational)", self.out_of_sequence_timestamps); // ====================================================================== // Phase 2 -- Decode: link/802.11 frame classification, per-band @@ -1011,10 +1180,19 @@ impl Stats { stat!("data frames", self.data_frames); stat!("control frames", self.ctrl_frames); nz!("extension frames (802.11 amendments)", self.extension_frames); + // Packet-accounting self-check (STATS.md identity 1). By this point all + // eight terminal per-packet dispositions are final, so the two rows below + // are 0 on every correct run -- they only ever appear if a future change + // drops a packet without a counter (unaccounted) or counts one twice + // (multi-counted). `run()` debug-asserts the same equality. 
+ let accounted = self.packets_accounted(); + nz!("packets unaccounted (BUG; report this)", self.total_packets.saturating_sub(accounted)); + nz!("frames multi-counted (BUG; report this)", accounted.saturating_sub(self.total_packets)); nz!("relay (WDS) frames", self.relay_frames); nz!("WPA encrypted data frames", self.wpa_encrypted_data); + nz!("WEP encrypted data frames", self.wep_encrypted_data); nz!("PMF-encrypted management frames (802.11w)", self.mgmt_protected_frames); - nz!(" Action body dropped (PMF-encrypted; FT/Mesh PMKIDs unavailable)", self.mgmt_protected_action_skipped); + nz!(" Action body dropped (PMF; FT/Mesh PMKIDs unavailable)", self.mgmt_protected_action_skipped); nz!("A-MSDU aggregated Data frames (802.11n)", self.amsdu_frames_seen); nz!(" subframes recovered for hidden EAPOL", self.amsdu_subframes_total); nz!("radiotap it_version != 0 (Tier 1 recovered)", self.radiotap_version_nonzero); @@ -1023,19 +1201,18 @@ impl Stats { nz!("FCS stripped (header + CRC-32 agree)", self.fcs_header_and_crc_agree); nz!("FCS stripped (CRC-32 detected, header silent)", self.fcs_detected_by_crc); nz!("FCS stripped (BADFCS flagged; corrupt on air)", self.fcs_badfcs_flagged); - nz!("FCS stripped (header announced, CRC-32 mismatch, no BADFCS)", self.fcs_crc_mismatch_no_flag); + nz!("FCS stripped (CRC-32 mismatch; no BADFCS flag)", self.fcs_crc_mismatch_no_flag); + nz!("no FCS present (frame left untouched)", self.fcs_neither); nz!("radiotap A-MPDU Status field present (it_present bit 20)", self.ampdu_status_frames); nz!("fragments buffered for reassembly", self.fragment_stats.fragments_seen); nz!(" reassembled MSDUs (all fragments present)", self.fragment_stats.fragments_reassembled); nz!(" incomplete MSDUs (missing fragments in capture)", self.fragment_stats.fragments_incomplete); - nz!( - " fragments evicted (safety cap; paranoid backstop, expect 0)", - self.fragment_stats.fragments_dropped_safety_cap - ); + nz!(" fragments evicted (safety cap; expect 0)", 
self.fragment_stats.fragments_dropped_safety_cap); nz!("AWDL frames (Apple AWDL)", self.awdl_frames); nz!("on 2.4 GHz band (from radiotap)", self.band_24ghz); nz!("on 5 GHz band (from radiotap)", self.band_5ghz); nz!("on 6 GHz band (from radiotap)", self.band_6ghz); + nz!("on other/unknown band (from radiotap)", self.band_other); // Beacon channel distribution from DS Parameter Set IE (tag 3). [§9.4.2.4] let (ch24_str, ch56_str) = self.format_beacon_channels(); if let Some(s) = ch24_str { @@ -1045,10 +1222,13 @@ impl Stats { stat!("beacon channels 5/6 GHz (DS Parameter Set)", s); } // EAPOL Key Descriptor Version mix is a decode-time classification. + // KDV 0 is split from the reserved bucket: it is the spec-legitimate + // "AKM-defined" value for the SHA-384 families, not an anomaly. nz!("EAPOL KDV 1 (HMAC-MD5 / ARC4; WPA legacy)", self.eapol_kdv1); nz!("EAPOL KDV 2 (HMAC-SHA1 / AES; WPA2-PSK)", self.eapol_kdv2); nz!("EAPOL KDV 3 (AES-CMAC; PSK-SHA256 / FT-PSK)", self.eapol_kdv3); - nz!("EAPOL KDV other (reserved / non-standard)", self.eapol_kdv_other); + nz!("EAPOL KDV 0 (AKM-defined; SHA-384 families)", self.eapol_kdv0); + nz!("EAPOL KDV reserved (4-7; non-standard)", self.eapol_kdv_other); nz!("EAPOL RSN descriptor", self.eapol_rsn); nz!("EAPOL WPA (legacy) descriptor", self.eapol_wpa); @@ -1071,9 +1251,11 @@ impl Stats { nz!(" 6 GHz co-located BSSIDs (RNR)", self.rnr_6ghz_colocated); nz!(" Multi-Link Elements observed (11be)", self.mle_basic_seen); nz!(" MLD addresses learned", self.mle_mld_addrs_learned); - nz!(" (AP,STA) groups merged via MLD", self.mld_groups_merged); - nz!(" SSID link-MAC entries merged via MLD", self.essid_link_macs_merged); + nz!(" (AP,STA) groups also keyed under MLD (link kept)", self.mld_groups_merged); + nz!(" SSID entries also keyed under MLD (link kept)", self.essid_link_macs_merged); nz!("PROBE RESPONSE (total)", self.probe_resp_frames); + nz!(" SSID unset (probe response retained)", self.probe_resp_ssid_unset); + nz!(" SSID 
zeroed (probe response retained)", self.probe_resp_ssid_zeroed); nz!("PROBE REQUEST (undirected)", self.probe_req_undirected); nz!("PROBE REQUEST (directed)", self.probe_req_directed); nz!("ASSOCIATION REQUEST (total)", self.assoc_req_frames); @@ -1116,11 +1298,16 @@ impl Stats { nz!(" OPEN SYSTEM", self.auth_open_system); nz!(" SHARED KEY (WEP)", self.auth_shared_key); nz!(" FAST BSS TRANSITION", self.auth_fbt); + nz!(" FT status 52 R0KH unreachable (diagnostic)", self.ft_status_r0kh_unreachable); + nz!(" FT status 53 invalid PMKID (diagnostic)", self.ft_status_invalid_pmkid); + nz!(" FT status 54 invalid MDE (diagnostic)", self.ft_status_invalid_mde); + nz!(" FT status 55 invalid FTE (diagnostic)", self.ft_status_invalid_fte); nz!(" SAE (WPA3)", self.auth_sae); nz!(" FILS", self.auth_fils); nz!(" NETWORK EAP (Cisco LEAP)", self.auth_network_eap); nz!(" PASN (unknown algo)", self.auth_pasn); nz!("DEAUTHENTICATION (total)", self.deauth_frames); + nz!(" MIC failure, reason 14 (handshake-quality signal)", self.mic_failure_deauths); nz!("DISASSOCIATION (total)", self.disassoc_frames); nz!("ACTION (total)", self.action_frames); nz!(" NR REQUEST (containing ESSID)", self.action_nr_req_ssids); @@ -1138,11 +1325,13 @@ impl Stats { nz!("ATIM (total)", self.atim_frames); nz!("MEASUREMENT PILOT (total)", self.measurement_pilot_frames); nz!("TIMING ADVERTISEMENT (total)", self.timing_advert_frames); + nz!("RESERVED subtype (7/15; counted, not extracted)", self.mgmt_reserved_subtype); // Auxiliary extracted metadata. 
stat!("ESSID (unique APs seen)", self.essid_count); + nz!(" ESSID changes (per-AP maximum)", self.essid_changes_max); nz!(" hash lines dropped (no SSID resolved; not crackable)", self.essid_unresolved_emissions); - nz!(" distinct APs dropped (see [essid_not_found_summary] in --log)", self.essid_unresolved_aps); + nz!(" distinct APs affected (detail in --log)", self.essid_unresolved_aps); nz!("SSID List IE entries extracted", self.ssid_list_entries); nz!("Country codes extracted", self.country_codes_extracted); nz!("Mesh IDs extracted", self.mesh_ids_extracted); @@ -1154,7 +1343,9 @@ impl Stats { nz!("Multiple BSSID profiles extracted", self.multiple_bssid_profiles); nz!("RNR BSSIDs extracted", self.rnr_bssids_extracted); nz!("P2P device names extracted", self.p2p_device_names_extracted); - nz!("Wordlist IE-scan runs inserted (--wordlist-scan-ies)", self.wordlist_scan_ie_runs); + nz!("Wordlist IE-scan runs inserted (--wordlist-scan)", self.wordlist_scan_ie_runs); + nz!("EAP identities extracted", self.identities_extracted); + nz!("EAP usernames extracted", self.usernames_extracted); // EAPOL message counts and validity rejects. let eapol_total = self.eapol_m1 + self.eapol_m2 + self.eapol_m3 + self.eapol_m4; @@ -1187,56 +1378,53 @@ impl Stats { ); } stat!(" M4 messages", self.eapol_m4); + // Garbage-pattern rejections. The M4-vs-rest split prints only its + // non-zero sides: the M4 row is the spec-zero expected case (matches + // hcxpcapngtool's eapolm4zeroedcount), the M1/M2/M3 row is the abnormal + // case worth a closer look. Zero sides are suppressed. nz!(" NULL nonce rejected (frame dropped)", self.null_nonce_rejected); if self.null_nonce_rejected > 0 { - // Split the operator-visible NULL nonce count into the spec-zero - // M4 case (expected, harmless; matches hcxpcapngtool's - // eapolm4zeroedcount) and the abnormal M1 / M2 / M3 case (entropy - // starvation, firmware bug, capture tampering -- worth a closer - // look). 
- stat!(" on M4 (spec-zero per §12.7.6.5; expected)", self.null_nonce_rejected_on_m4); - stat!( - " on M1 / M2 / M3 (abnormal -- entropy starvation or firmware bug)", + nz!(" on M4 (spec-zero per §12.7.6.5; expected)", self.null_nonce_rejected_on_m4); + nz!( + " on M1 / M2 / M3 (abnormal; firmware or entropy bug)", self.null_nonce_rejected - self.null_nonce_rejected_on_m4 ); } nz!(" 0xFF nonce rejected (frame dropped)", self.ff_nonce_rejected); if self.ff_nonce_rejected > 0 { - stat!(" on M4", self.ff_nonce_rejected_on_m4); - stat!(" on M1 / M2 / M3", self.ff_nonce_rejected - self.ff_nonce_rejected_on_m4); + nz!(" on M4", self.ff_nonce_rejected_on_m4); + nz!(" on M1 / M2 / M3", self.ff_nonce_rejected - self.ff_nonce_rejected_on_m4); } - nz!(" garbage-pattern nonce rejected (repeating period; frame dropped)", self.repeat_nonce_rejected); + nz!(" repeating-pattern nonce rejected (frame dropped)", self.repeat_nonce_rejected); if self.repeat_nonce_rejected > 0 { - stat!(" on M4", self.repeat_nonce_rejected_on_m4); - stat!(" on M1 / M2 / M3", self.repeat_nonce_rejected - self.repeat_nonce_rejected_on_m4); + nz!(" on M4", self.repeat_nonce_rejected_on_m4); + nz!(" on M1 / M2 / M3", self.repeat_nonce_rejected - self.repeat_nonce_rejected_on_m4); } nz!(" NULL MIC rejected (frame dropped; M2/M3/M4)", self.null_mic_rejected); nz!(" 0xFF MIC rejected (frame dropped; M2/M3/M4)", self.ff_mic_rejected); - nz!(" garbage-pattern MIC rejected (repeating period; frame dropped)", self.repeat_mic_rejected); + nz!(" repeating-pattern MIC rejected (M2/M3/M4; dropped)", self.repeat_mic_rejected); nz!(" NULL PMKID rejected (placeholder; PMKID dropped)", self.null_pmkid_rejected); nz!(" 0xFF PMKID rejected (PMKID dropped)", self.ff_pmkid_rejected); - nz!(" garbage-pattern PMKID rejected (repeating period; PMKID dropped)", self.repeat_pmkid_rejected); - nz!( - " ESSID control bytes seen (0x00-0x1F in body; informational, SSID shipped unchanged)", - self.essid_control_bytes_warned - ); - if 
self.eapol_time_gap_max_us > 0 { + nz!(" repeating-pattern PMKID rejected (PMKID dropped)", self.repeat_pmkid_rejected); + nz!(" ESSID control bytes (informational; shipped unchanged)", self.essid_control_bytes_warned); + // Sub-millisecond gaps print in microseconds instead of a misleading 0 ms. + if self.eapol_time_gap_max_us >= 1_000 { stat!(" session time gap max (ms)", self.eapol_time_gap_max_us / 1_000); + } else { + nz!(" session time gap max (us)", self.eapol_time_gap_max_us); } - nz!( - " ANonce M1/M3 mismatch sessions (diagnostic; both anchors emitted; spec §12.7.6.4)", - self.anonce_m1_m3_mismatch_sessions - ); + nz!(" ANonce M1/M3 mismatch sessions (diagnostic; §12.7.6.4)", self.anonce_m1_m3_mismatch_sessions); // EAPOL direction classification (WDS tier breakdown). nz!("EAPOL classified by direction (Tier 1)", self.eapol_tier1_direction); nz!(" WDS via essid_map (Tier 1b; recovered)", self.eapol_tier1b_essid); nz!(" WDS via ACK discovery (Tier 2; recovered)", self.eapol_tier2_ack_discovery); nz!(" WDS flag-based fallback (Tier 3; recovered)", self.eapol_tier3_flag_fallback); - nz!(" direction/ACK mismatches (diagnostic; frame still paired)", self.eapol_ack_mismatches); + nz!(" direction/ACK mismatches (diagnostic; still paired)", self.eapol_ack_mismatches); nz!(" preauthentication frames (EtherType 0x88C7)", self.eapol_preauth_frames); nz!(" LLC accepted but EAPOL parse rejected (frame dropped)", self.eapol_llc_invalid); - nz!(" Mesh Data frames recovered (Mesh Control header unwrapped)", self.mesh_control_frames); + nz!(" Mesh Data frames recovered (Mesh Control unwrapped)", self.mesh_control_frames); + nz!(" Mesh Data dropped (bad Mesh Control header)", self.mesh_control_malformed); nz!(" EAP-Success frames (RFC 3748 §4.2)", self.eap_success_frames); nz!(" EAP-Failure frames (RFC 3748 §4.2)", self.eap_failure_frames); @@ -1258,6 +1446,12 @@ impl Stats { nz!(" Probe Response RSN IE (S17, vendor deviation)", self.pmkid_probe_resp); nz!(" Mesh Peering 
AMPE (S18/S19)", self.pmkid_mesh); nz!(" OSEN IE (S20, Hotspot 2.0)", self.pmkid_osen); + // Second child dimension of the same insertion total: by AKM family. + // Per ARCHITECTURE.md §2: non-FT = WPA2-PSK / PSK-SHA256 / PSK-SHA384; + // FT = FT-PSK / FT-PSK-SHA384. Labels avoid an embedded ": " so the + // first ": " on any banner row is always the label/value separator. + nz!(" by AKM family (non-FT PSK/SHA256/SHA384)", self.pmkid_wpa2_psk); + nz!(" by AKM family (FT-PSK/FT-PSK-SHA384)", self.pmkid_ft_psk); // ====================================================================== // Phase 4 -- Emit: hashes written, files produced, dedup decisions. @@ -1265,12 +1459,18 @@ impl Stats { // ====================================================================== section!(4, "Emit"); + // Run context: which output filters were active. A WIDE run and a + // --strict run must be distinguishable from the banner alone. + if !self.filters_active.is_empty() { + stat!("output filters active", self.filters_active); + } + // Per-hash-type breakdown -- one row per `HashType` variant from the // 11-type classification in ARCHITECTURE.md §2. Anchors every emitted hash // line to a single (AKM, attack surface) so the operator can read off // exactly what hashcat will see, type code by type code. if self.hash_type_emitted.values().any(|&n| n > 0) { - println!("per-hash-type lines emitted (per ARCHITECTURE.md §2):"); + let _ = writeln!(out, "per-hash-type lines emitted (per ARCHITECTURE.md §2):"); for ht in HashType::all() { let n = self.hash_type_emitted.get(&ht).copied().unwrap_or(0); if n > 0 { @@ -1280,20 +1480,29 @@ impl Stats { } } - // Pairing engine results (Phase 4 first half: pair/ -> output/). - stat!("EAPOL pairs generated (total)", self.eapol_pairs_generated); + // Pairing engine results (Phase 4 first half: pair/ -> output/). 
The + // generated total is pre-dedup; the written total and its combo children + // are post-dedup, so the children sum to the written row, not the + // generated one. + stat!("EAPOL pairs generated (total, pre-dedup)", self.eapol_pairs_generated); + stat!("EAPOL pairs written (post-dedup)", self.eapol_pairs_useful); nz!(" N1E2 challenge (ANonce from M1, EAPOL from M2)", self.pairs_written_n1e2); nz!(" N3E2 authorized (ANonce from M3, EAPOL from M2)", self.pairs_written_n3e2); nz!(" N1E4 authorized (ANonce from M1, EAPOL from M4)", self.pairs_written_n1e4); nz!(" N2E3 authorized (SNonce from M2, EAPOL from M3, AP-less)", self.pairs_written_n2e3); nz!(" N4E3 authorized (SNonce from M4, EAPOL from M3, AP-less)", self.pairs_written_n4e3); nz!(" N3E4 authorized (ANonce from M3, EAPOL from M4)", self.pairs_written_n3e4); - nz!(" NC flag set on pair (nonce-error-correction hint passed to hashcat)", self.pairs_nc); - nz!(" LE endianness flag set on pair (LE-router hint passed to hashcat)", self.pairs_le); - nz!(" BE endianness flag set on pair (BE-router hint passed to hashcat)", self.pairs_be); - nz!(" NC-dedup near-identical-nonce lines collapsed (--nc-dedup)", self.nc_dedup_collapsed_lines); + nz!(" NC flag set (nonce-error-correction hint for hashcat)", self.pairs_nc); + nz!(" LE endianness flag set (LE-router hint for hashcat)", self.pairs_le); + nz!(" BE endianness flag set (BE-router hint for hashcat)", self.pairs_be); + nz!(" NC-dedup lines collapsed (--nc-dedup)", self.nc_dedup_collapsed_lines); nz!(" NC-dedup cluster count (--nc-dedup)", self.nc_dedup_cluster_count); nz!(" NC-dedup max cluster size (--nc-dedup)", self.nc_dedup_max_cluster_size); + // Opt-in output-filter drops (zero in WIDE mode). These reduce the + // candidate set BEFORE the generated total above, so they are reported + // as their own lines rather than folded into the generated/written gap. 
+ nz!(" candidates dropped (--eapoltimeout filter)", self.pairs_time_filtered); + nz!(" candidates dropped (--rc-drift filter)", self.pairs_rc_filtered); if self.rc_drift_enabled && self.rc_gap_max > 0 { // Firmware bugs and replay-counter corruption in wild captures produce // values like 2^56. Cap the display at 2^32 so the "suggested threshold" @@ -1307,20 +1516,29 @@ impl Stats { } } - // PMKIDs found by AKM family (extraction-time tally, before dedup and - // before the type-1-vs-type-2 routing). The actual emitted-line counts - // appear in the per-hash-type breakdown above and the per-sink counters - // below; this row just shows how many raw PMKIDs each AKM family - // contributed. Per ARCHITECTURE.md §2: non-FT family = WPA2-PSK / - // PSK-SHA256 / PSK-SHA384; FT family = FT-PSK / FT-PSK-SHA384. - nz!("PMKIDs found by AKM family (non-FT: WPA2-PSK/SHA256/SHA384)", self.pmkid_wpa2_psk); - nz!("PMKIDs found by AKM family (FT: FT-PSK/FT-PSK-SHA384)", self.pmkid_ft_psk); - - // Per-sink hash output rows. Each configured sink shows its file path and - // the line / dedup-dropped counters; unconfigured sinks show "not configured" - // and skip the counter rows. The legacy 22000 / 37100 sinks remain hashcat- - // compatible via the 4-prefix scheme; the per-AKM-family and combined sinks - // emit the 11-type classification prefixes from `ARCHITECTURE.md §2`. + // PMKID emission (post-dedup logical count). The extraction-time totals + // and the per-AKM-family split live in Phase 3 under "PMKID store + // insertions"; this is what survived dedup at least once. + stat!("PMKIDs written (post-dedup)", self.pmkids_written); + + // Global dedup accounting. Total plus the per-kind children, so the + // pre-dedup totals above reconcile: pairs generated = pairs written + + // EAPOL pair duplicates; PMKID insertions >= PMKIDs written + PMKID + // duplicates (insertions also shed garbage-pattern and unresolved-SSID + // drops counted in Phase 3). 
+ nz!("dedup dropped (total; duplicate hashes not written)", self.dedup_dropped); + nz!(" EAPOL pair duplicates", self.dedup_dropped_pairs); + nz!(" PMKID duplicates", self.dedup_dropped_pmkids); + // Emit-time drops of crack material we extracted but could not format. + nz!("hashes dropped (unclassified AKM; no 11-type)", self.emit_dropped_unclassified_akm); + nz!("hashes dropped (FT context missing; no R0KH-ID)", self.emit_dropped_ft_no_context); + + // Per-sink hash output rows. Only configured sinks render (decision: + // banner space goes to what the run actually produced); the trailing + // one-liner counts the rest so the full sink surface stays discoverable. + // The legacy 22000 / 37100 sinks remain hashcat-compatible via the + // 4-prefix scheme; the per-AKM-family and combined sinks emit the + // 11-type classification prefixes from `ARCHITECTURE.md §2`. let sinks: [(&str, &str, u64, u64); 9] = [ ("--22000-out (legacy mode 22000)", &self.path_22000, self.lines_22000, self.dropped_22000), ("--37100-out (legacy mode 37100)", &self.path_37100, self.lines_37100, self.dropped_37100), @@ -1337,28 +1555,40 @@ impl Stats { self.dropped_ft_psk_sha384, ), ]; + let mut hash_sinks_unconfigured = 0u64; for (label, path, lines, dropped) in sinks { - let display = if path.is_empty() { "not configured" } else { path }; - stat!(label, display); - if !path.is_empty() { + if path.is_empty() { + hash_sinks_unconfigured += 1; + } else { + stat!(label, path); stat!(" lines written", lines); nz!(" dedup dropped (duplicate hashes; not written)", dropped); } } - - // Auxiliary output files (Phase 4 tail: wordlists, identities, device info). 
- let path_essid = if self.essid_list_path.is_empty() { "not configured" } else { &self.essid_list_path }; - let path_probe = if self.probe_list_path.is_empty() { "not configured" } else { &self.probe_list_path }; - let path_wl = if self.wordlist_path.is_empty() { "not configured" } else { &self.wordlist_path }; - let path_id = if self.identity_list_path.is_empty() { "not configured" } else { &self.identity_list_path }; - let path_un = if self.username_list_path.is_empty() { "not configured" } else { &self.username_list_path }; - let path_di = if self.device_info_path.is_empty() { "not configured" } else { &self.device_info_path }; - stat!("ESSID list (-E)", path_essid); - stat!("probe ESSID list (-R)", path_probe); - stat!("wordlist (-W)", path_wl); - stat!("identity list (-I)", path_id); - stat!("username list (-U)", path_un); - stat!("device info (-D)", path_di); + nz!("hash sinks not configured", hash_sinks_unconfigured); + + // Auxiliary output files (Phase 4 tail). Same configured-only rule, and + // each configured sink reports the entries it actually wrote -- parity + // with the hash sinks' "lines written" rows. 
+ let aux_sinks: [(&str, &str, u64); 7] = [ + ("ESSID list (-E)", &self.essid_list_path, self.entries_essid_list), + ("probe ESSID list (-R)", &self.probe_list_path, self.entries_probe_list), + ("wordlist (-W)", &self.wordlist_path, self.entries_wordlist), + ("IE-scan wordlist (--wordlist-scan)", &self.wordlist_scan_path, self.entries_wordlist_scan), + ("identity list (-I)", &self.identity_list_path, self.entries_identity_list), + ("username list (-U)", &self.username_list_path, self.entries_username_list), + ("device info (-D)", &self.device_info_path, self.entries_device_info), + ]; + let mut aux_sinks_unconfigured = 0u64; + for (label, path, entries) in aux_sinks { + if path.is_empty() { + aux_sinks_unconfigured += 1; + } else { + stat!(label, path); + stat!(" entries written", entries); + } + } + nz!("auxiliary sinks not configured", aux_sinks_unconfigured); // ====================================================================== // Phase 5 -- Report: closing one-liner. (See ARCHITECTURE.md §3.5.) @@ -1366,15 +1596,66 @@ impl Stats { section!(5, "Report"); // Total hashes = sum of per-`HashType` counts (counted once per logical hash - // regardless of how many sinks it fanned out to). Distinct types observed = number - // of `HashType` rows whose counter is non-zero. + // regardless of how many sinks it fanned out to). The EAPOL/PMKID children + // come from the same table -- odd type codes are EAPOL attacks, even codes + // are PMKID attacks per the ARCHITECTURE.md §2 encoding rule -- so the two + // children always sum to the total. Distinct types observed = number of + // `HashType` rows whose counter is non-zero. 
let total_hashes: u64 = HashType::all().map(|ht| self.hash_type_emitted.get(&ht).copied().unwrap_or(0)).sum(); + let eapol_lines: u64 = HashType::all() + .filter(|ht| ht.type_code() % 2 == 1) + .map(|ht| self.hash_type_emitted.get(&ht).copied().unwrap_or(0)) + .sum(); + let pmkid_lines: u64 = total_hashes - eapol_lines; let active_types = HashType::all().filter(|ht| self.hash_type_emitted.get(ht).copied().unwrap_or(0) > 0).count(); stat!("hashes emitted (total)", total_hashes); + nz!(" EAPOL hash lines", eapol_lines); + nz!(" PMKID hash lines", pmkid_lines); stat!("distinct hash types observed", active_types); - println!("---"); + // Run cost. Wallclock is split at the Phase 3 / Phase 4 boundary (the + // streaming pass vs the pairing + emit pass); throughput is file bytes + // over the streaming pass. One decimal place via integer math -- no + // float casts under the cast-lint policy. + let fmt_s = |ms: u64| format!("{}.{}", ms / 1000, (ms % 1000) / 100); + if self.wallclock_p13_ms > 0 || self.wallclock_p4_ms > 0 { + stat!("wallclock Phase 1-3 streaming pass (s)", fmt_s(self.wallclock_p13_ms)); + stat!("wallclock Phase 4 emit (s)", fmt_s(self.wallclock_p4_ms)); + stat!("wallclock total (s)", fmt_s(self.wallclock_p13_ms + self.wallclock_p4_ms)); + } + if self.bytes_ingested > 0 && self.wallclock_p13_ms > 0 { + // tenths of MiB/s = bytes * 10_000 / (1 MiB * ms). Saturating guards + // the petabyte-scale corner instead of wrapping. + let tenths = self.bytes_ingested.saturating_mul(10_000) + / (1_048_576u64.saturating_mul(self.wallclock_p13_ms).max(1)); + stat!("throughput (MiB/s)", format!("{}.{}", tenths / 10, tenths % 10)); + } + nz!("peak RSS (MiB)", self.peak_rss_mib); + stat!("disk-backed fallback engaged", if self.disk_mode_engaged { "yes" } else { "no" }); + + // Zero-hash hint: one line naming the single largest drop counter so the + // operator knows where to look first. 
No advice paragraphs (terse banner + // is a feature); the named counter's own row carries the detail. + if total_hashes == 0 && self.total_packets > 0 { + let candidates: [(&str, u64); 6] = [ + ("NULL nonce rejected", self.null_nonce_rejected), + ("LLC accepted but EAPOL parse rejected", self.eapol_llc_invalid), + ("hash lines dropped (no SSID resolved)", self.essid_unresolved_emissions), + ("link/parse errors", self.link_errors), + ("MAC header malformed", self.malformed_mac_hdr), + ("input files skipped", self.files_skipped_unknown_format), + ]; + let largest = candidates.iter().max_by_key(|(_, n)| *n).filter(|(_, n)| *n > 0); + if let Some((label, n)) = largest { + stat!("hint (no hashes)", format!("largest drop is \"{label}\" ({n})")); + } else { + stat!("hint (no hashes)", "no EAPOL or PMKID material found in capture"); + } + } + + let _ = writeln!(out, "---"); + out } } @@ -1424,8 +1705,10 @@ mod tests { assert_eq!(s.unreadable_packets, 0); assert_eq!(s.files_skipped_unknown_format, 0); assert_eq!(s.out_of_sequence_timestamps, 0); - assert_eq!(s.hashes_written, 0); + assert_eq!(s.pmkids_written, 0); assert_eq!(s.dedup_dropped, 0); + assert_eq!(s.dedup_dropped_pairs, 0); + assert_eq!(s.dedup_dropped_pmkids, 0); assert_eq!(s.essid_count, 0); assert_eq!(s.beacon_frames, 0); assert_eq!(s.probe_resp_frames, 0); @@ -1516,17 +1799,19 @@ mod tests { s.record_key_descriptor_version(3); s.record_key_descriptor_version(3); s.record_key_descriptor_version(3); - s.record_key_descriptor_version(0); // reserved -> other + s.record_key_descriptor_version(0); // AKM-defined (SHA-384 families) -> kdv0 s.record_key_descriptor_version(7); // reserved -> other + assert_eq!(s.eapol_kdv0, 1); assert_eq!(s.eapol_kdv1, 2); assert_eq!(s.eapol_kdv2, 1); assert_eq!(s.eapol_kdv3, 3); - assert_eq!(s.eapol_kdv_other, 2); + assert_eq!(s.eapol_kdv_other, 1); } #[test] fn record_kdv_starts_at_zero() { let s = Stats::new(); + assert_eq!(s.eapol_kdv0, 0); assert_eq!(s.eapol_kdv1, 0); 
assert_eq!(s.eapol_kdv2, 0); assert_eq!(s.eapol_kdv3, 0); @@ -1635,8 +1920,9 @@ mod tests { s.m4_auth_len_max = 95; s.pmkids_found = 5; s.essid_count = 3; - s.hashes_written = 14; + s.pmkids_written = 4; s.dedup_dropped = 2; + s.dedup_dropped_pairs = 2; s.link_errors = 1; s.truncated_capture_files = 2; s.unreadable_packets = 2; @@ -1674,4 +1960,355 @@ mod tests { s.last_file = "/captures/last.pcap".to_owned(); s.print_summary(); } + + /// Lights up every banner row, then asserts each rendered row keeps at + /// least two leader dots before the value column -- i.e. every label fits + /// the W=60 contract from `ARCHITECTURE.md §9`. A new row whose label + /// exceeds the cap fails here (and trips the `debug_assert` in the row + /// macros) instead of silently breaking the column alignment. + #[test] + fn banner_labels_fit_column() { + let mut s = Stats::new(); + // Phase 1. + s.input_file_count = 2; + *s.file_formats_seen.entry("pcap 2.4".to_owned()).or_insert(0) += 2; + *s.endians_seen.entry("little endian".to_owned()).or_insert(0) += 2; + *s.dlt_descs_seen.entry("DLT_IEEE802_11_RADIO (127)".to_owned()).or_insert(0) += 2; + s.last_file = "b.pcap".to_owned(); + s.timestamp_first_us = 1_000_000; + s.timestamp_last_us = 2_000_000; + s.bytes_ingested = 10 * 1024 * 1024; + s.total_packets = 100; + s.link_errors = 1; + s.malformed_mac_hdr = 1; + s.lenient_proto_version = 1; + s.truncated_capture_files = 1; + s.unreadable_packets = 1; + s.files_skipped_unknown_format = 1; + s.packets_unknown_linktype = 1; + s.truncated_after_header = 1; + s.packets_zeroed_timestamp = 1; + s.out_of_sequence_timestamps = 1; + // Phase 2. 
+ s.mgmt_frames = 50; + s.data_frames = 40; + s.ctrl_frames = 10; + s.extension_frames = 1; + s.relay_frames = 1; + s.wpa_encrypted_data = 1; + s.wep_encrypted_data = 1; + s.mgmt_protected_frames = 1; + s.mgmt_protected_action_skipped = 1; + s.amsdu_frames_seen = 1; + s.amsdu_subframes_total = 1; + s.radiotap_version_nonzero = 1; + s.recovered_tier2 = 1; + s.recovered_tier3 = 1; + s.fcs_header_and_crc_agree = 1; + s.fcs_detected_by_crc = 1; + s.fcs_badfcs_flagged = 1; + s.fcs_crc_mismatch_no_flag = 1; + s.fcs_neither = 1; + s.ampdu_status_frames = 1; + s.fragment_stats.fragments_seen = 1; + s.fragment_stats.fragments_reassembled = 1; + s.fragment_stats.fragments_incomplete = 1; + s.fragment_stats.fragments_dropped_safety_cap = 1; + s.awdl_frames = 1; + s.band_24ghz = 1; + s.band_5ghz = 1; + s.band_6ghz = 1; + s.band_other = 1; + *s.beacon_channels.entry(6).or_insert(0) += 1; + *s.beacon_channels.entry(36).or_insert(0) += 1; + s.eapol_kdv0 = 1; + s.eapol_kdv1 = 1; + s.eapol_kdv2 = 1; + s.eapol_kdv3 = 1; + s.eapol_kdv_other = 1; + s.eapol_rsn = 1; + s.eapol_wpa = 1; + // Phase 3: management subtype tree. 
+ s.beacon_frames = 1; + s.beacon_ssid_wildcard = 1; + s.beacon_ssid_zeroed = 1; + s.beacon_ssid_oversized = 1; + s.rsnxe_sae_h2e = 1; + s.rsnxe_sae_pk = 1; + s.rsnxe_secure_ltf = 1; + s.rsnxe_protected_twt = 1; + s.rnr_blocks_parsed = 1; + s.rnr_6ghz_colocated = 1; + s.mle_basic_seen = 1; + s.mle_mld_addrs_learned = 1; + s.mld_groups_merged = 1; + s.essid_link_macs_merged = 1; + s.probe_resp_frames = 1; + s.probe_resp_ssid_unset = 1; + s.probe_resp_ssid_zeroed = 1; + s.probe_req_undirected = 1; + s.probe_req_directed = 1; + s.assoc_req_frames = 1; + s.assoc_req_wpa1 = 1; + s.assoc_req_wpa2_psk = 1; + s.assoc_req_ft_psk = 1; + s.assoc_req_ft_psk_sha384 = 1; + s.assoc_req_psk_sha256 = 1; + s.assoc_req_psk_sha384 = 1; + s.assoc_req_sae = 1; + s.assoc_req_owe = 1; + s.assoc_req_fils = 1; + s.assoc_req_pasn = 1; + s.assoc_req_enterprise_sha1 = 1; + s.assoc_req_enterprise_sha256 = 1; + s.assoc_req_enterprise_sha384 = 1; + s.assoc_req_tdls = 1; + s.assoc_req_appeerkey = 1; + s.assoc_req_akm_unknown = 1; + s.assoc_resp_frames = 1; + s.reassoc_req_frames = 1; + s.reassoc_req_wpa1 = 1; + s.reassoc_req_wpa2_psk = 1; + s.reassoc_req_ft_psk = 1; + s.reassoc_req_ft_psk_sha384 = 1; + s.reassoc_req_psk_sha256 = 1; + s.reassoc_req_psk_sha384 = 1; + s.reassoc_req_sae = 1; + s.reassoc_req_owe = 1; + s.reassoc_req_fils = 1; + s.reassoc_req_pasn = 1; + s.reassoc_req_enterprise_sha1 = 1; + s.reassoc_req_enterprise_sha256 = 1; + s.reassoc_req_enterprise_sha384 = 1; + s.reassoc_req_tdls = 1; + s.reassoc_req_appeerkey = 1; + s.reassoc_req_akm_unknown = 1; + s.reassoc_resp_frames = 1; + s.auth_frames = 1; + s.auth_open_system = 1; + s.auth_shared_key = 1; + s.auth_fbt = 1; + s.ft_status_r0kh_unreachable = 1; + s.ft_status_invalid_pmkid = 1; + s.ft_status_invalid_mde = 1; + s.ft_status_invalid_fte = 1; + s.auth_sae = 1; + s.auth_fils = 1; + s.auth_network_eap = 1; + s.auth_pasn = 1; + s.deauth_frames = 1; + s.mic_failure_deauths = 1; + s.disassoc_frames = 1; + s.action_frames = 1; + 
s.action_nr_req_ssids = 1; + s.fils_discovery_ssids = 1; + s.action_ft_frames = 1; + s.action_mesh_peering = 1; + s.anqp_gas_frames = 1; + s.anqp_venue_name = 1; + s.anqp_domain_name = 1; + s.anqp_nai_realm = 1; + s.anqp_hs_operator_friendly_name = 1; + s.anqp_unknown_info_id = 1; + s.anqp_fragmented_skipped = 1; + s.action_no_ack_frames = 1; + s.atim_frames = 1; + s.measurement_pilot_frames = 1; + s.timing_advert_frames = 1; + s.mgmt_reserved_subtype = 1; + // Phase 3: ESSID + plaintext surfaces. + s.essid_count = 1; + s.essid_changes_max = 1; + s.essid_unresolved_emissions = 1; + s.essid_unresolved_aps = 1; + s.ssid_list_entries = 1; + s.country_codes_extracted = 1; + s.mesh_ids_extracted = 1; + s.wps_probe_req_extracted = 1; + s.vendor_ap_names_extracted = 1; + s.owe_transition_ssids = 1; + s.ccx1_ap_names_extracted = 1; + s.time_zones_extracted = 1; + s.multiple_bssid_profiles = 1; + s.rnr_bssids_extracted = 1; + s.p2p_device_names_extracted = 1; + s.wordlist_scan_ie_runs = 1; + s.identities_extracted = 1; + s.usernames_extracted = 1; + // Phase 3: EAPOL block. 
+ s.eapol_m1 = 1; + s.eapol_m2 = 1; + s.eapol_m3 = 1; + s.eapol_m4 = 1; + s.m1_auth_len_max = 95; + s.m2_auth_len_max = 121; + s.m3_auth_len_max = 151; + s.m4_auth_len_max = 95; + s.null_nonce_rejected = 2; + s.null_nonce_rejected_on_m4 = 1; + s.ff_nonce_rejected = 2; + s.ff_nonce_rejected_on_m4 = 1; + s.repeat_nonce_rejected = 2; + s.repeat_nonce_rejected_on_m4 = 1; + s.null_mic_rejected = 1; + s.ff_mic_rejected = 1; + s.repeat_mic_rejected = 1; + s.null_pmkid_rejected = 1; + s.ff_pmkid_rejected = 1; + s.repeat_pmkid_rejected = 1; + s.essid_control_bytes_warned = 1; + s.eapol_time_gap_max_us = 1_500_000; + s.anonce_m1_m3_mismatch_sessions = 1; + s.eapol_tier1_direction = 1; + s.eapol_tier1b_essid = 1; + s.eapol_tier2_ack_discovery = 1; + s.eapol_tier3_flag_fallback = 1; + s.eapol_ack_mismatches = 1; + s.eapol_preauth_frames = 1; + s.eapol_llc_invalid = 1; + s.mesh_control_frames = 1; + s.mesh_control_malformed = 1; + s.eap_success_frames = 1; + s.eap_failure_frames = 1; + // Phase 3: PMKID sources. + s.pmkids_found = 13; + s.pmkid_m1 = 1; + s.pmkid_m2 = 1; + s.pmkid_assoc_req = 1; + s.pmkid_reassoc_req = 1; + s.pmkid_ft_auth = 1; + s.pmkid_fils_auth = 1; + s.pmkid_pasn_auth = 1; + s.pmkid_ft_action = 1; + s.pmkid_probe_req = 1; + s.pmkid_beacon = 1; + s.pmkid_probe_resp = 1; + s.pmkid_mesh = 1; + s.pmkid_osen = 1; + s.pmkid_wpa2_psk = 1; + s.pmkid_ft_psk = 1; + // Phase 4. 
+ s.filters_active = "eapoltimeout=5, rc-drift=8, dedup-hash-combos, nc-dedup (tolerance 8)".to_owned(); + for ht in HashType::all() { + *s.hash_type_emitted.entry(ht).or_insert(0) += 1; + } + s.eapol_pairs_generated = 8; + s.eapol_pairs_useful = 6; + s.pairs_written_n1e2 = 1; + s.pairs_written_n3e2 = 1; + s.pairs_written_n1e4 = 1; + s.pairs_written_n2e3 = 1; + s.pairs_written_n4e3 = 1; + s.pairs_written_n3e4 = 1; + s.pairs_nc = 1; + s.pairs_le = 1; + s.pairs_be = 1; + s.pairs_time_filtered = 1; + s.pairs_rc_filtered = 1; + s.nc_dedup_collapsed_lines = 1; + s.nc_dedup_cluster_count = 1; + s.nc_dedup_max_cluster_size = 1; + s.rc_drift_enabled = true; + s.rc_gap_max = 3; + s.pmkids_written = 5; + s.dedup_dropped = 3; + s.dedup_dropped_pairs = 2; + s.dedup_dropped_pmkids = 1; + s.emit_dropped_unclassified_akm = 1; + s.emit_dropped_ft_no_context = 1; + s.path_22000 = "h.22000".to_owned(); + s.lines_22000 = 1; + s.dropped_22000 = 1; + s.path_37100 = "h.37100".to_owned(); + s.lines_37100 = 1; + s.path_combined = "h.all".to_owned(); + s.lines_combined = 1; + s.path_wpa1 = "h.wpa1".to_owned(); + s.path_wpa2 = "h.wpa2".to_owned(); + s.path_psk_sha256 = "h.s256".to_owned(); + s.path_ft = "h.ft".to_owned(); + s.path_psk_sha384 = "h.s384".to_owned(); + s.path_ft_psk_sha384 = "h.fts384".to_owned(); + s.essid_list_path = "essids.txt".to_owned(); + s.entries_essid_list = 1; + s.probe_list_path = "probes.txt".to_owned(); + s.entries_probe_list = 1; + s.wordlist_path = "wl.txt".to_owned(); + s.entries_wordlist = 1; + s.identity_list_path = "ids.txt".to_owned(); + s.entries_identity_list = 1; + s.username_list_path = "users.txt".to_owned(); + s.entries_username_list = 1; + s.device_info_path = "devs.tsv".to_owned(); + s.entries_device_info = 1; + s.wordlist_scan_path = "scan.txt".to_owned(); + s.entries_wordlist_scan = 1; + // Phase 5. 
+ s.wallclock_p13_ms = 1_500; + s.wallclock_p4_ms = 200; + s.peak_rss_mib = 10; + s.disk_mode_engaged = true; + + let rendered = s.summary_string(); + let mut rows_checked = 0usize; + for line in rendered.lines() { + if line.starts_with("===") || line == "---" || line.starts_with("wpawolf ") || line.ends_with(':') { + continue; + } + let Some(idx) = line.find(": ") else { continue }; + let label_part = &line[..idx]; + assert!(label_part.ends_with(".."), "banner row label too wide (no dot leader): {line}"); + rows_checked += 1; + } + // Guard against the test silently checking nothing if the layout changes. + assert!(rows_checked > 150, "expected the lit-up banner to render >150 rows, got {rows_checked}"); + } + + /// `packets_accounted` sums exactly the eight terminal dispositions, so a + /// consistent run reconciles to `total_packets` (identity 1) and neither BUG + /// row renders; an inconsistent run surfaces the discrepancy. + #[test] + fn packet_accounting_identity_reconciles_and_surfaces_breaks() { + let mut s = Stats::new(); + s.total_packets = 12; + s.mgmt_frames = 4; + s.data_frames = 3; + s.ctrl_frames = 2; + s.extension_frames = 1; + s.link_errors = 1; + s.malformed_mac_hdr = 0; + s.truncated_after_header = 1; + s.packets_unknown_linktype = 0; + assert_eq!(s.packets_accounted(), 12, "the eight terminal buckets must sum to total"); + let rendered = s.summary_string(); + assert!(!rendered.contains("unaccounted (BUG"), "consistent run must not show the BUG row:\n{rendered}"); + assert!(!rendered.contains("multi-counted (BUG"), "{rendered}"); + + // Drop a packet without a counter -> unaccounted surfaces. + s.total_packets = 13; + assert_eq!(s.total_packets - s.packets_accounted(), 1); + assert!(s.summary_string().contains("packets unaccounted (BUG; report this)")); + + // Count one twice -> multi-counted surfaces. 
+ s.total_packets = 11; + assert!(s.summary_string().contains("frames multi-counted (BUG; report this)")); + } + + /// Zero-hash runs print a one-line hint naming the largest drop counter. + #[test] + fn zero_hash_hint_names_largest_drop() { + let mut s = Stats::new(); + s.total_packets = 10; + s.null_nonce_rejected = 7; + s.filters_active = "none (WIDE mode)".to_owned(); + let rendered = s.summary_string(); + assert!(rendered.contains("hint (no hashes)"), "missing hint row:\n{rendered}"); + assert!(rendered.contains("NULL nonce rejected"), "hint must name the largest drop:\n{rendered}"); + + // And when nothing was dropped either, the hint says so explicitly. + let mut empty = Stats::new(); + empty.total_packets = 10; + let rendered_empty = empty.summary_string(); + assert!(rendered_empty.contains("no EAPOL or PMKID material found in capture"), "{rendered_empty}"); + } } diff --git a/src/store/essid.rs b/src/store/essid.rs index a46fb7f..674f1a2 100644 --- a/src/store/essid.rs +++ b/src/store/essid.rs @@ -230,6 +230,17 @@ impl EssidMap { self.map.len() } + /// Returns the largest number of distinct SSID variants recorded for any + /// single AP, or 0 when the map is empty. + /// + /// Feeds the Phase 3 `ESSID changes (per-AP maximum)` banner row (variants + /// minus the initial SSID): the same per-AP fan-out that drives the + /// multi-ESSID inflation filter, surfaced as a capture-quality signal. + #[must_use] + pub fn max_ssid_variants(&self) -> usize { + self.map.values().map(Vec::len).max().unwrap_or(0) + } + /// Coarse heap + struct-bytes estimate for `--mem-stats` reporting. 
/// /// Sums `HashMap` bucket overhead, every `Vec` allocation, diff --git a/tests/integration/generated_corpus.rs b/tests/integration/generated_corpus.rs index b1bc69d..999525c 100644 --- a/tests/integration/generated_corpus.rs +++ b/tests/integration/generated_corpus.rs @@ -89,6 +89,28 @@ fn corpus_root_exists() { assert!(root.exists(), "missing corpus root -- run wpawolf-fixturegen first"); } +/// Packet-accounting invariant (STATS.md reconciliation identity 1): every packet +/// wpawolf reads reaches exactly one terminal disposition, so `total_packets` +/// equals the sum of the eight per-packet disposition counters. The banner +/// surfaces any break as a "packets unaccounted (BUG)" / "frames multi-counted +/// (BUG)" row. This drives the whole generated corpus through the binary -- which +/// exercises management/data/control/extension frames plus the link-error and +/// unknown-DLT drop paths -- and asserts neither BUG row ever appears. The spawned +/// binary is a debug build, so `run()`'s `debug_assert_eq!` is a second backstop: +/// a break aborts the process and `run_wpawolf_capture_stats`'s success assert +/// fails. 
+#[test] +fn packet_accounting_holds_across_generated_corpus() { + let root = Path::new(CORPUS_ROOT); + if !root.exists() { + return; // corpus not generated; `corpus_root_exists` covers that case + } + let banner = run_wpawolf_capture_stats(root); + assert!(banner.contains("packets total"), "no banner captured:\n{banner}"); + assert!(!banner.contains("unaccounted (BUG"), "packet accounting broke:\n{banner}"); + assert!(!banner.contains("multi-counted (BUG"), "frames were multi-counted:\n{banner}"); +} + #[test] fn manifest_is_present() { let manifest = Path::new(CORPUS_ROOT).join("ground_truth/manifest.toml"); diff --git a/tests/integration/log_categories_coverage.rs b/tests/integration/log_categories_coverage.rs index b8f2730..f2021b2 100644 --- a/tests/integration/log_categories_coverage.rs +++ b/tests/integration/log_categories_coverage.rs @@ -404,8 +404,8 @@ fn invalid_protocol_version_is_forgiven_not_logged() { // A frame with FC bit B0 set has Protocol Version = 1 (reserved per §9.2.4.1.1). // We forgive the version anomaly (every 802.11 amendment through 2024 reuses // the v=0 MAC layout) and do NOT emit a [malformed_frame] entry; the operator - // sees the count via the Phase 1 summary line "frames with non-zero Protocol - // Version (forgiven)". This matches tshark / wireshark's lenient dissection. + // sees the count via the Phase 1 summary line "non-zero Protocol Version + // (forgiven; processed)". This matches tshark / wireshark's lenient dissection. let pcap = "/tmp/wpawolf_logcov_protover.pcap"; let log = "/tmp/wpawolf_logcov_protover.log"; @@ -440,7 +440,7 @@ fn invalid_protocol_version_is_forgiven_not_logged() { // Phase 1 stats summary must include the forgiven count. 
let stdout_contents = fs::read_to_string(&stdout_path).unwrap(); assert!( - stdout_contents.contains("frames with non-zero Protocol Version (forgiven"), + stdout_contents.contains("non-zero Protocol Version (forgiven"), "expected stats line in stdout; got:\n{stdout_contents}" ); } diff --git a/tools/audit_stats.sh b/tools/audit_stats.sh new file mode 100755 index 0000000..19f102d --- /dev/null +++ b/tools/audit_stats.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# +# audit_stats.sh -- drift gate between STATS.md (the banner contract) and the +# counters that actually exist in the code. +# +# Two directions: +# 1. Every public field of `Stats` (src/stats.rs) and `FragmentStats` +# (src/store/fragments.rs) must be named in backticks somewhere in +# STATS.md. Per-sink field families that STATS.md documents once per family +# (`lines_*`, `dropped_*`, `path_*`, `reassoc_req_*`) are exempted by prefix. +# 2. Every `stats.<field>` reference in STATS.md or ARCHITECTURE.md must be a +# real field -- catches a renamed or deleted counter leaving a stale +# documentation reference behind. +# +# Wired into `make check-all` via the `audit-stats` target. Exits non-zero on +# any drift so CI fails before a stale contract lands. +set -euo pipefail + +cd "$(dirname "$0")/.." + +contract="STATS.md" +arch="ARCHITECTURE.md" +stats_src="src/stats.rs" +fragments_src="src/store/fragments.rs" + +for f in "${contract}" "${stats_src}" "${fragments_src}"; do + if [[ ! -f ${f} ]]; then + echo "audit_stats: missing ${f}" >&2 + exit 1 + fi +done + +fields_file="$(mktemp)" +trap 'rm -f "${fields_file}"' EXIT + +# Public struct fields: ` pub <name>: <type>,` lines. `pub fn` does not match +# because the function name is not followed directly by a colon. 
+grep -hoE '^[[:space:]]*pub [a-z0-9_]+:' "${stats_src}" "${fragments_src}" \ + | awk '{print $2}' | tr -d ':' | sort -u > "${fields_file}" + +fail=0 + +# --- Direction 1: every code field must be named in the contract --- +# Accepted forms: `field` or `stats.field` in backticks anywhere in STATS.md. +while IFS= read -r field; do + case "${field}" in + lines_* | dropped_* | path_* | reassoc_req_*) continue ;; + *) ;; + esac + if ! grep -qE "\`(stats\.|fragment_stats\.)?${field}\`" "${contract}"; then + echo " UNDOCUMENTED: ${field} (exists in code, not named in ${contract})" + fail=1 + fi +done < "${fields_file}" + +# --- Direction 2: every `stats.<field>` doc reference must exist in code --- +# `stats.rs` is the source-file name, not a field reference. +refs="$(grep -hoE 'stats\.[a-z0-9_]+' "${contract}" "${arch}" | sort -u || true)" +while IFS= read -r ref; do + # Skip source-file / script-name fragments, not field references. + [[ -z ${ref} || ${ref} == "stats.rs" || ${ref} == "stats.sh" ]] && continue + field="${ref#stats.}" + if ! grep -qx "${field}" "${fields_file}"; then + echo " STALE DOC REF: ${ref} (referenced in docs, no such field)" + fail=1 + fi +done <<< "${refs}" + +field_count="$(wc -l < "${fields_file}")" +if [[ ${fail} -ne 0 ]]; then + echo "audit_stats: FAIL -- ${contract} and the code disagree (see above)" + exit 1 +fi +echo "audit_stats: OK -- ${field_count} counter fields reconciled against ${contract}" From 55049dd5b90b91acfbebf3bb7c81578c345c167f Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:05:56 -0400 Subject: [PATCH 13/14] docs: document the stats contract and recent fixes README: add the STATS.md reference, refresh the sample banner, and correct the runtime-crate count and test totals. CONTRIBUTING: point the audit-stats gate description at STATS.md and update the test count. 
CHANGELOG: record the stats banner rebuild, the packet-accounting enforcement, and the MLD additive-canonicalization fix. --- CHANGELOG.md | 70 +++++++++++++++++++++++++++++-------------------- CONTRIBUTING.md | 10 +++---- README.md | 37 +++++++++++++++----------- 3 files changed, 67 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19e923c..4f542cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,18 +4,29 @@ This file is a current-state summary of `wpawolf` rather than a per-release diar ## Releases -### v0.4.0 -- unreleased (`feat/adaptive-pipeline`) +### v0.4.0 -- unreleased -Rayon-based parallel pairing with streaming per-group fan-out, cross-platform memory monitoring via `sysinfo`, per-packet buffer recycling, and `FtFields` boxing. No change to hashcat-line output format; 22000 / 37100 / per-AKM lines are byte-identical to v0.3.10 for any capture. +Rayon-based parallel pairing with streaming per-group fan-out, automatic disk-backed fallback under memory pressure, Linux cooked capture (SLL/SLL2) support, per-packet CRC-32 FCS validation with tiered recovery of corrupt link-layer headers, out-of-order MSDU fragment reassembly, a `--log` triage redesign, and removal of `--per-file`. No change to hashcat-line output format; 22000 / 37100 / per-AKM lines are byte-identical to v0.3.10 for any capture. - **Streaming per-group fan-out eliminates the all-pairs Vec.** `emit_inner` no longer collects every `PairedHash` across all groups into a single `Vec` before iterating. Instead, `pair_all_groups_streaming()` delivers each group's pairs via a callback that locks a `Mutex`, fans out (ESSID resolution + hash classification + dedup check + buffered write), and releases. Peak memory drops by `sizeof(PairedHash) * total_pairs` -- roughly 540 MB on a 2.25B-pair corpus. Pairs are dropped at callback exit. 
- **Rayon work-stealing replaces manual `std::thread::scope` + LPT scheduling.** `pair_all_groups_streaming()` runs pairing across a per-run rayon thread pool (`--threads N`). Work-stealing handles load imbalance naturally (heavy groups don't block the tail). The Mutex serializes only the I/O fan-out (microseconds per group); pairing itself runs fully lock-free across cores. -- **Cross-platform memory monitoring via `sysinfo`.** `current_rss_bytes()`, `current_rss_mib()`, `total_ram_bytes()`, and `ram_info()` replace Linux-only `/proc/self/status` and `/proc/meminfo` parsing. Works on Linux, macOS, and Windows. Process aborts with a clear "approaching OOM" message if RSS exceeds 80% of system RAM during Phase 1 ingestion or Phase 4 pairing. -- **`--max-eapol-per-type` removed.** No per-type message cap; all messages flow into the pairing engine. Use `--per-file` to bound RSS on large corpora. -- **New runtime dependencies: `rayon` 1.x (173M+ downloads, parallel iteration) and `sysinfo` 0.39 (cross-platform memory queries).** Both pass `cargo deny` license checks (MIT/Apache-2.0). Dep count goes from 2 to 4. +- **Cross-platform memory monitoring via `sysinfo`.** `current_rss_bytes()`, `current_rss_mib()`, `total_ram_bytes()`, and `ram_info()` replace Linux-only `/proc/self/status` and `/proc/meminfo` parsing. Works on Linux, macOS, and Windows. RSS crossing 80 % of system RAM (override via `WPAWOLF_MEM_THRESHOLD`) during Phase 1 ingestion or Phase 4 emission now triggers the disk-backed fallback below instead of aborting the process. 
+- **Disk-backed fallback for `MessageStore`, `PmkidStore`, and hash-line dedup.** When `MemMonitor` trips the 80 % threshold it sets a sticky disk-mode flag: both stores spill to temp-file storage (`src/store/disk_messages.rs`, fixed-header binary records, no serde) and Phase 4 streams groups back one at a time; per-sink dedup falls over to partitioned fingerprint bucket files with write-through output and a post-run cleaning pass that rewrites each sink without the temporary duplicates (`src/output/disk_dedup.rs`). Disk writers are flushed before Phase 4 starts so the pairing pass always sees a complete store. Corpus-scale runs degrade to disk speed instead of dying. +- **Linux cooked capture support (DLT 113 SLL, DLT 276 SLL2).** `src/link/sll.rs` dispatches on the SLL `sll_hatype` ARPHRD value: 801 (raw 802.11), 802 (Prism header, including AVS-within-Prism via the existing magic detection), 803 (radiotap). SLL2's 20-byte header (vs 16 for SLL v1) shares the same dispatch. Captures taken on the Linux `any` pseudo-device now produce hashes instead of being dropped as unsupported DLTs. +- **Per-packet CRC-32 FCS validation on every DLT.** The 802.11 FCS is standard CRC-32 (ISO 3309); `crc32(data || fcs)` always yields the residue constant `0x2144DF1C`, so `link::fcs::resolve()` verifies FCS presence in one pass and classifies every frame into five counted outcomes (`fcs_header_and_crc_agree`, `fcs_detected_by_crc`, `fcs_badfcs_flagged`, `fcs_crc_mismatch_no_flag`, `fcs_neither`). The quiet win is `fcs_detected_by_crc`: frames whose link-layer header never announced an FCS previously carried 4 trailing checksum bytes into the IE walker as frame-body data, occasionally mis-parsing tag/length pairs. +- **Tiered recovery for corrupt link-layer headers** (`src/link/recover.rs`). 
When `link::strip()` fails, Tier 2 recomputes the radiotap header length from the `it_present` bitmask (bails on the variable-size TLV / vendor-namespace bits); Tier 3 scans byte offsets 0-144 for the CRC-32 residue match that proves where the 802.11 frame starts (14-byte minimum slice kills null/FF-padding false positives). Counters: `recovered_tier2`, `recovered_tier3`. Radiotap parsing also no longer rejects non-zero `it_version` -- `it_len` is the field that actually determines the 802.11 offset; the anomaly is counted via `radiotap_version_nonzero` instead of dropping the frame. +- **Out-of-order 802.11 MSDU fragment reassembly.** `FragmentStore` buffers fragments per `(SA, RA, SeqNum)` in any arrival order and reassembles when the final fragment plus all predecessors are present. The `fragments_dropped_disorder` counter is gone; `fragments_incomplete` reports MSDUs whose missing fragments never appeared in the capture, and `fragments_dropped_safety_cap` keeps its paranoid-backstop role. +- **`--log` redesigned as a triage tool.** Per-event categories (`eapol_key_rejected`, `invalid_nonce`, `invalid_mic`, `invalid_pmkid`, `unknown_linktype`, `capture_read_error`, `skipped_input`, `essid_not_found_summary`) write immediately and carry `file=` / `frame=` context; the `ts=` field is dropped. High-volume categories (`plcp_error`, `malformed_frame`, `essid_control_bytes`, `unknown_akm`) aggregate during the run and flush as per-reason summary lines with counts. Obvious high-volume rejections (null-kind nonces / MICs / PMKIDs, out-of-sequence timestamps, FCS outcomes, recovery tiers) are stats-banner-only and no longer flood the log. Key=value quoting is normalised: non-integer values are double-quoted, hex-only values stay bare. +- **`--max-eapol-per-type` removed.** No per-type message cap; all messages flow into the pairing engine. RSS on large corpora is bounded by the disk-backed fallback. 
+- **`--per-file` removed.** The flag traded cross-file pairing for bounded memory; the disk-backed fallback bounds memory without giving up cross-file pairing, so the trade-off no longer buys anything. `--strict` accordingly shrinks to `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --nc-dedup`. Scripts passing `--per-file` need the flag deleted. +- **New runtime dependencies: `rayon` 1.x (parallel iteration), `sysinfo` 0.39 (cross-platform memory queries), and `crc32fast` 1.x (SIMD CRC-32, promoted from transitive-via-`flate2` to direct for per-packet FCS validation).** All pass `cargo deny` license checks (MIT/Apache-2.0). Dep count goes from 2 to 5. - **Per-packet buffer recycling in pcap and pcapng readers.** `PcapReader` and `PcapngReader` reuse a single `Vec` across `next_packet()` calls via a new `PacketReader::recycle_buffer()` trait method. The caller returns the packet's data buffer after each iteration; the reader reuses the allocation for the next read. Eliminates ~32.5 million heap allocations on a 5.4 GB corpus (96.8M -> 64.3M total allocations, 34% reduction). The pcapng reader also eliminates a double `.to_vec()` in EPB parsing: the old code cloned the entire block body to free a borrow conflict, then cloned the packet data sub-slice; the new code inlines EPB field extraction into `read_next_block()` and copies only the packet data once into the recycled buffer. - **`EapolMessage`, `PairedHash`, and `PmkidEntry` box the cold `Option<FtFields>` field.** Changed from `Option<FtFields>` (58 bytes inline) to `Option<Box<FtFields>>` (8 bytes, null-pointer optimized). Over 99.9% of instances carry `None` (FT-PSK / 802.11r is rare in real-world captures); the boxed form saves 50 bytes per struct. `MessageStore` footprint drops from 280 MiB to 227 MiB on the test corpus (-19%). Combined with buffer recycling, peak RSS drops ~14% (726 -> 623 MiB). -- `make check-all` passes clean. 
+- **Stats banner redesign.** The closing banner is now a formal contract ([`STATS.md`](STATS.md)) rendered by `Stats::summary_string` at a W=60 value column with a test-enforced 58-char label cap, four-class issue suffixes (`dropped` / `recovered` / `diagnostic` / `informational`), and no embedded `": "` inside labels so `awk -F': '` parsing stays unambiguous. New run-context and cost rows: `output filters active` (a WIDE run and a `--strict` run are now distinguishable from the banner alone), per-phase wallclock + total, `throughput (MiB/s)`, `peak RSS (MiB)`, `disk-backed fallback engaged`, and a one-line `hint (no hashes)` row naming the largest drop counter on zero-hash runs. Accounting fixes: `EAPOL pairs generated` is now labelled pre-dedup with a new post-dedup `EAPOL pairs written` row whose combo children actually sum to it (the old generated total over-counted by the PMKID dedup drops -- the dedup counter was shared); `dedup dropped (total)` gains `EAPOL pair duplicates` / `PMKID duplicates` children; `PMKIDs written (post-dedup)` and the `no FCS present` row complete the FCS and PMKID accounting. Sink rows render configured sinks only (plus `hash/auxiliary sinks not configured: N` one-liners), and auxiliary sinks report `entries written` for parity with the hash sinks' line counts. New counters: `mic_failure_deauths` (Deauth reason 14 per Table 9-90), `ft_status_r0kh_unreachable` / `ft_status_invalid_pmkid` / `ft_status_invalid_mde` / `ft_status_invalid_fte` (Auth status 52-55 per Table 9-92 -- each one explains a missing FT-PSK handshake), `packets_zeroed_timestamp`, `wep_encrypted_data` (ExtIV=0 split from the WPA family per §12.5.2.2), `probe_resp_ssid_unset` / `probe_resp_ssid_zeroed`, `essid_changes_max`, `identities_extracted` / `usernames_extracted`, `bytes_ingested`, and `eapol_kdv0` (KDV=0 relabelled "AKM-defined; SHA-384 families" per Table 12-11 instead of being lumped into the reserved bucket). 
Cosmetic fixes: SLL/SLL2 now render as `DLT_LINUX_SLL` / `DLT_LINUX_SLL2` in the file metadata instead of `DLT_UNKNOWN`; sub-millisecond session gaps print in microseconds instead of `0` ms; the M4/non-M4 rejection splits no longer print bare-zero sides; the memory-pressure notices moved from stderr to stdout per the stderr-silent convention. A new `make audit-stats` gate (wired into `check-all`) diffs every `Stats` / `FragmentStats` field against the banner contract in both directions, killing the stats-doc drift class permanently. +- **Stats consistency pass + [`STATS.md`](STATS.md) contract.** Audited every parse / decode / extract / pair / emit site for information gleaned but not counted, and closed every silent disposition so an operator can account for every packet and see every drop. New counters: `packets_unknown_linktype` and `truncated_after_header` (the two packet-level drops the main loop used to `continue` past silently), `band_other` (radiotap channel outside the three known bands), `mgmt_reserved_subtype` (so the management subtype rows reconcile to `mgmt_frames`), `mesh_control_malformed` (Mesh Control header with a reserved Address-Extension Mode -- previously a silent `None`), `pairs_time_filtered` / `pairs_rc_filtered` (pairs removed by the opt-in `--eapoltimeout` / `--rc-drift` filters, previously invisible), and `emit_dropped_unclassified_akm` / `emit_dropped_ft_no_context` (crack material dropped at emit because the AKM mapped to no 11-type or an FT line lacked its R0KH context -- both were bare `continue`s in the fan-out). The `--wordlist-scan` sink now appears in the Phase 4 auxiliary-sink list with its entry count. 
The full per-line catalogue -- label, backing field, spec source, why-we-care, and drop behaviour for every row, plus the four packet/pair/hash reconciliation identities -- moved out of `ARCHITECTURE.md §9` into the new tracked [`STATS.md`](STATS.md); `§9` is now a one-paragraph pointer, and `make audit-stats` checks the code against `STATS.md`. +- **MLD canonicalization is now additive (fixes a single-link-on-MLO miss).** When an AP advertises an 802.11be Multi-Link Element, wpawolf used to *rewrite* every handshake / PMKID / SSID key from the link MAC onto the MLD MAC. That silently broke single-link associations to one BSSID of an MLD: their PTK/MIC is derived under the **link** MAC, so the rewritten MLD-keyed line is uncrackable. Now `MessageStore` / `PmkidStore` / `EssidMap` canonicalization *adds* an MLD-keyed copy while *keeping* the original link-keyed form, so both reach output -- the multi-link case cracks under the MLD MAC, the single-link case under the link MAC, and the crackable one is always present (the same "emit every candidate" rule as the six N#E# combos). Surfaced by a corpus diff against `hcxpcapngtool`: it was the one handshake in 70.8M packets that hcx extracted and wpawolf missed; with the fix, wpawolf is a complete superset (0 misses). The additive `EssidMap` path also repairs a latent side effect of the old `mem::take` rebuild, which reset every per-SSID observation count to 1 and thereby defeated the `--essid-collapse` filter (it cannot find a dominant SSID when all counts look like count-1 bit-flips); preserving the counts restores the collapse, so RF-rotted SSID variants are dropped as designed while each handshake keeps its crackable dominant-SSID line. Disk-mode canonicalization is additive too. Banner labels updated to "(AP,STA) groups / SSID entries also keyed under MLD (link kept)". Output stays duplicate-free (FR-CORRECT-2: `sort | uniq -d` empty across the corpus). 
+- **Build / toolchain:** `make release` aliases the native optimised build; the CI MSRV check is pinned at Rust 1.95. +- 922 tests; `make check-all` passes clean. ### v0.3.10 -- 2026-05-16 @@ -44,7 +55,7 @@ OOM prevention, full diagnostic mode, stdout migration, and per-failure-reason E - **`--max-eapol-per-type N` cap prevents Phase 4 OOM on rotating-ANonce captures (default 2048).** Some APs retransmit M1 with a fresh `ANonce` on each attempt (rotating-ANonce firmware behaviour). Because each unique `ANonce` produces a distinct `eapol_frame` byte string, the existing dedup-on-insert gate does not collapse them and `MessageStore` grows unboundedly. Phase 4's O(M1 × M2) Cartesian product over these groups then exhausts RAM before any output is written. The new per-type cap limits each `(AP, STA)` group to `N` stored messages per EAPOL type independently (M1/M2/M3/M4 each capped separately). Frames beyond the cap return `Admission::TypeSaturated`, increment `eapol_type_saturated_dropped`, and fire a `[eapol_key_rejected]` log line the first time a `(pair, type)` combo saturates. Operators who need full fidelity on known-clean corpora can raise or disable the cap (`--max-eapol-per-type=0`). Corpus analysis on a 5.4 GB, 1 788-file multi-vendor corpus: only two `(AP, STA)` pairs hit the cap, both rotating-ANonce M1; uncapped (`--max-eapol-per-type=0`) run completed cleanly on a 16 GB machine with peak RSS 1 184 MiB; both saturated APs had 1 526 and 1 538 unique M1s respectively, safely within the 2 048 default. The per-type cap saturation counter and per-combo details appear in `--debug` output before Phase 4 starts. - **All output routed to stdout; stderr is now silent.** Progress lines, the stats banner, `--mem-stats` output, and all `[log]` category warnings previously written to stderr are now written to stdout. stderr produces no output under any input. 
This aligns `wpawolf` with standard UNIX pipeline conventions where all user-visible diagnostic text travels on the same stream as hash output only when `-o /dev/stdout` is used, and makes log capture trivial (`wpawolf ... > run.log 2>&1` becomes `wpawolf ... > run.log`). Hash files written to paths supplied by `-o`, `--22000-out`, etc. are unaffected -- those go to files, not stdout. - **`--debug` diagnostic mode.** Emits timestamped, phase-annotated lines to stdout that give the operator a real-time picture of Phase 1–4 progress without requiring a post-run `--log` file. Volume is bounded by design: per-group lines are suppressed unless `cost ≥ 50 000` (HEAVY groups); lighter groups are covered by a progress ticker every 5 000 completed groups; the fan-out loop emits a ticker every 500 000 pairs and a summary line after the loop. On a 282 000-group corpus this yields approximately 5 500 debug lines. Key diagnostic outputs: Phase transition start/done with RSS; per-file format/DLT/packet/EAPOL/PMKID deltas; pre-Phase-4 store summary with cost-tier breakdown (zero, low 1–999, medium 1k–49k, heavy ≥50k), saturated-pair list (each `(AP, STA, msg_type)` that hit the per-type cap), and top-25 groups by pairing cost; Phase 4 HEAVY-group start/done with pair count and elapsed time; Phase 4 fan-out progress and a post-loop summary showing `in`, `written`, `dedup_dropped`, `nc_collapsed`, and `nc_clusters`; Linux memory check at each per-file boundary (always prints `[MEMORY WARNING]` when system RAM exceeds 80 % regardless of whether `--debug` is set). -- **`[eapol_key_rejected]` log category.** A new `--log` category that fires at the `eapol_llc_invalid` increment site for frames that passed the LLC/packet-type gate (`EtherType 0x888E/0x88C7`, packet type = 3 = EAPOL-Key) but failed the EAPOL-Key parser for a structural reason other than a garbage nonce or MIC (those are already fully captured by `[invalid_nonce]` / `[invalid_mic]`). 
Each line carries `timestamp_us`, `ap=`, `sta=`, `reason=` (one of `bad_llc_header`, `bad_ethertype`, `truncated_short`, `bad_descriptor_type`, `bad_kdv`, `truncated_24mic`, `classify_flags_invalid`), and `bytes=` (first 32 raw bytes in lowercase colon-hex for cross-referencing with tshark / Wireshark). On the ALL_CAPS corpus: 15 589 of the 15 624 `eapol_llc_invalid` frames are spec-correct M4 null-nonce drops already captured by `[invalid_nonce]`; the remaining 25 log entries represent 7 unique `(AP, STA)` frame patterns across 3 capture files (KDV=4 vendor extensions, non-standard descriptor types, one tshark-confirmed `[Malformed Packet]`). Corpus cross-check against hcxpcapngtool and tshark confirms zero salvageable data in any structural-failure category. +- **`[eapol_key_rejected]` log category.** A new `--log` category that fires at the `eapol_llc_invalid` increment site for frames that passed the LLC/packet-type gate (`EtherType 0x888E/0x88C7`, packet type = 3 = EAPOL-Key) but failed the EAPOL-Key parser for a structural reason other than a garbage nonce or MIC (those are already fully captured by `[invalid_nonce]` / `[invalid_mic]`). Each line carries `timestamp_us`, `ap=`, `sta=`, `reason=` (one of `bad_llc_header`, `bad_ethertype`, `truncated_short`, `bad_descriptor_type`, `bad_kdv`, `truncated_24mic`, `classify_flags_invalid`), and `bytes=` (first 32 raw bytes in lowercase colon-hex for cross-referencing with tshark / Wireshark). On a 5.4 GB multi-vendor corpus: 15 589 of the 15 624 `eapol_llc_invalid` frames are spec-correct M4 null-nonce drops already captured by `[invalid_nonce]`; the remaining 25 log entries represent 7 unique `(AP, STA)` frame patterns across 3 capture files (KDV=4 vendor extensions, non-standard descriptor types, one tshark-confirmed `[Malformed Packet]`). Corpus cross-check against hcxpcapngtool and tshark confirms zero salvageable data in any structural-failure category. 
- 844 tests (up from 833 in v0.3.7); `make check-all` passes clean. ### v0.3.7 -- 2026-05-12 @@ -145,12 +156,12 @@ Five explicit phases, each owned by a discrete module: | Phase | Module | Role | |---|---|---| | 1 Ingest | `src/input/` | pcap / pcapng / gzip readers | -| 2 Decode | `src/link/` + `src/ieee80211/` | radiotap/PPI/Prism/AVS strip; 802.11 frame, IE, RSN, EAPOL, EAP, FT parsing | +| 2 Decode | `src/link/` + `src/ieee80211/` | radiotap/PPI/Prism/AVS/SLL strip, CRC-32 FCS resolve, tiered recovery; 802.11 frame, IE, RSN, EAPOL, EAP, FT parsing | | 3 Extract | `src/extract/` + `src/store/` | per-subtype handlers populate AP / STA / EAPOL / PMKID / ESSID / aux stores | | 4 Emit | `src/pair/` + `src/output/` | N#E# pairing, hashcat 22000/37100 line formatting, dedup, wordlists | | 5 Report | `src/stats.rs` | operator-facing summary printed unconditionally on stdout | -Each `src/**/*.rs` carries a `//! Phase N -- ...` doc-comment naming its phase and the relevant ARCHITECTURE.md section. +Pipeline modules carry a `//!` doc-comment naming their phase and the relevant ARCHITECTURE.md section. 
## Hash-output coverage (ARCHITECTURE.md §2) @@ -175,7 +186,7 @@ The SHA-384 family (types 8/9/10/11) is detected and counted in stats but not ye ## Input - pcap (all six magic variants including Kuznetzov-patched 24-byte per-packet headers), IXIA `lcap` hardware-capture and software-capture variants (4 magics, per wireshark `wiretap/libpcap.c`), pcapng (multi-SHB, multi-IDB, `if_tsresol`, `if_tsoffset`), gzip-wrapped variants of any of the above -- DLT 105 (raw 802.11), 127 (radiotap), 192 (PPI), 119 (Prism, including AVS-within-Prism detection), 163 (AVS, big-endian per spec) +- DLT 105 (raw 802.11), 127 (radiotap), 192 (PPI), 119 (Prism, including AVS-within-Prism detection), 163 (AVS, big-endian per spec), 113 (Linux cooked SLL, ARPHRD 801/802/803 dispatch), 276 (SLL2) - Streaming reader; never buffers more than one block - I/O errors abort; parse errors log-and-continue - Positional arguments may be files or directories. Directories are walked recursively and every regular file whose first 4 bytes match a supported capture-file magic (pcap microsecond / nanosecond / Kuznetzov in either byte order, pcapng SHB, or gzip) is added to the input set in deterministic (sorted) order. File extensions are not consulted. Symlinks are not followed. @@ -184,7 +195,7 @@ The SHA-384 family (types 8/9/10/11) is detected and counted in stats but not ye Every spec-defined PMKID location S1-S20 is extracted: M1 Key Data KDE, M2 RSN IE, Association / Reassociation Request RSN IE, FT Authentication (S5/S6), FILS Authentication (S7/S8), PASN Authentication (S9/S10), FT Action frames (S11-S13), Probe Request (S14/S15), Beacon / ProbeResponse vendor deviation (S16/S17), Mesh Peering AMPE Chosen-PMK (S18/S19), OSEN/Hotspot-2.0 IE (S20). -NULL and 0xFF nonces, MICs, and PMKIDs are rejected unconditionally on every message type, including M4. M1 NULL MIC is spec-valid and not flagged. 
M4 NULL nonce is spec-valid on the wire per [IEEE 802.11-2024] §12.7.6.5 NOTE 9 but the resulting hash line is mathematically uncrackable (the live PTK depends on M2's `SNonce`, which the M4 frame does not carry); rejected at extract from v0.3.5 onward, matching hcxpcapngtool. See `ARCHITECTURE.md §5.10`. +Garbage-pattern nonces, MICs, and PMKIDs (`null`, `ff`, `repeat_1`, `repeat_2`, `repeat_4`) are rejected unconditionally on every message type, including M4. M1 NULL MIC is spec-valid and not flagged. M4 NULL nonce is spec-valid on the wire per [IEEE 802.11-2024] §12.7.6.5 NOTE 9 but the resulting hash line is mathematically uncrackable (the live PTK depends on M2's `SNonce`, which the M4 frame does not carry); rejected at extract from v0.3.5 onward, matching hcxpcapngtool. See `ARCHITECTURE.md §5.10`. ## CLI flags @@ -218,8 +229,7 @@ Auxiliary outputs and runtime knobs. | `--dedup-hash-combos` | output filter: collapse 6 N#E# combos to 3 unique per session | | `--nc-dedup` | output filter: collapse near-identical-nonce siblings into one FLAG_NC survivor | | `--nc-tolerance N` | NC-dedup cluster span tolerance (default 8, matches hashcat `NONCE_ERROR_CORRECTIONS`) | -| `--strict` | shortcut: `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --per-file --nc-dedup` | -| `--per-file` | pair + emit per input file, then clear per-file stores (bounded memory) | +| `--strict` | shortcut: `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --nc-dedup` | | `--threads N` | pairing thread count (default = CPU count) | | `--debug` | timestamped Phase 1-4 diagnostic output to stdout | | `--quiet` | suppress progress lines; closing banner still prints | @@ -230,22 +240,26 @@ Auxiliary outputs and runtime knobs. The defaults emit all 6 N#E# combos per session and apply no time or replay-counter filtering -- maximum hash yield. Output filter flags narrow that further. The closing stats summary on stdout is unconditional; there is no `--stats` toggle. 
-## Stats output (ARCHITECTURE.md §9) +## Stats output (see [`STATS.md`](STATS.md)) Five banner sections in pipeline order: ``` -=== Phase 1 -- Ingest === file metadata, packet count, link errors -=== Phase 2 -- Decode === mgmt/data/control split, per-band, KDV mix -=== Phase 3 -- Extract === per-subtype mgmt counters, per-AKM assoc/reassoc - breakdown, ESSID/PMKID/EAPOL discovery, - NULL/0xFF rejects, identities/usernames/devices -=== Phase 4 -- Emit === per-hash-type 11-row breakdown, pair-combo - counts, NC/LE/BE flags, dedup drops, output paths -=== Phase 5 -- Report === total hashes, distinct hash types observed +=== Phase 1 -- Ingest === file metadata, bytes/packets, integrity counters +=== Phase 2 -- Decode === frame-type split, encryption mix, recovery tiers, + FCS accounting, fragments, bands, KDV mix +=== Phase 3 -- Extract === per-subtype mgmt tree (incl. FT failure statuses, + MIC-failure deauths), ESSID block, plaintext + surfaces, EAPOL totals + garbage rejects, + PMKID S1-S20 sources +=== Phase 4 -- Emit === active output filters, per-hash-type 11-row + breakdown, pairs generated/written, dedup split, + configured sinks with line/entry counts +=== Phase 5 -- Report === total hashes (EAPOL/PMKID split), wallclock, + throughput, peak RSS, disk-mode, zero-hash hint ``` -The per-hash-type breakdown leads Phase 4 and prints one row per `HashType` variant with the canonical 11-type name verbatim (`WPA2-PSK-EAPOL`, `FT-PSK-PMKID`, etc.). +Rows render at a fixed W=60 value column with dotted leaders; the full row-by-row contract (label, backing field, spec source, why-we-care, drop behaviour) is [`STATS.md`](STATS.md), enforced by `make audit-stats`. The per-hash-type breakdown prints one row per `HashType` variant with the canonical 11-type name verbatim (`WPA2-PSK-EAPOL`, `FT-PSK-PMKID`, etc.). 
## Parity oracle @@ -257,12 +271,12 @@ The parity test parses the oracle banner, refuses stale versions, and hard-fails ## Quality bar -- 904 tests (unit + binary + integration, including a superset oracle asserting `wpawolf_output >= hcxpcapngtool_output` on every fixture with `hcxpcapngtool >= 7.0.1`, a cross-file pairing oracle confirming the shared `MessageStore` reassembles handshakes split across pcap files, and the `generated_corpus` oracle that runs wpawolf against every fixture produced by the in-tree `wpawolf-fixturegen` workspace member) +- 920 tests (unit + binary + integration, including a superset oracle asserting `wpawolf_output >= hcxpcapngtool_output` on every fixture with `hcxpcapngtool >= 7.0.1`, a cross-file pairing oracle confirming the shared `MessageStore` reassembles handshakes split across pcap files, and the `generated_corpus` oracle that runs wpawolf against every fixture produced by the in-tree `wpawolf-fixturegen` workspace member) - Sibling workspace crate `tools/fixturegen` emits a deterministic pcap/pcapng corpus covering all 11 hash types, the 20 PMKID extraction sites, the 6 N#E# combos, and the link-layer / container variants. Crypto primitives anchored to KAT vectors - Strict clippy: `pedantic`, `nursery`, `cargo` enabled; `-D warnings` - `#![forbid(unsafe_code)]` at crate root - `cargo deny` gates the supply chain (OSI-approved permissive licences only) -- `make check-all` runs `fmt`, `clippy`, `audit`, `test`, `doc -D warnings`, ASCII / LF hygiene, `cargo machete`. Required green before any commit +- `make check-all` runs `fmt`, `clippy`, `audit` (cargo deny), `audit-citations`, `audit-stats` (`STATS.md` banner contract vs `src/stats.rs`), `check`, `test`, `doc -D warnings`, ASCII / LF hygiene, `cargo machete`. 
Required green before any commit - An external multi-GB regression dataset (out-of-tree) is exercised opportunistically before each release; it confirms content-level superset of hcxpcapngtool on real-world traffic that is too noisy or legally-encumbered to commit ## Performance @@ -289,8 +303,6 @@ The parity test parses the oracle banner, refuses stale versions, and hard-fails - Inner-EAP hash extraction (EAP-MD5 / LEAP / MSCHAPv2 -- v2) - Legacy hashcat formats (hccap, hccapx, mode 2500) - WEP, pure SAE, OWE hash-line emission (parsed and counted, not emitted) -- FCS validation -- Memory-usage estimation in stats (use `/usr/bin/time -v`) ## Trajectory @@ -307,7 +319,7 @@ Wire-level correctness improvements that landed alongside the nine-sink surface: - **PMKID-AKM decoupling** for vendor M1 quirks. Various consumer router firmware regularly emits an M1 with KDV=1 (HMAC-MD5 MIC) AND a PMKID KDE in Key Data: a wire-level inconsistency where the descriptor type is RSN (0x02) but the MIC algorithm is the legacy WPA1 one. The PMKID itself is still computed with the AKM-defined PRF (HMAC-SHA1 for AKM 2), so it remains crackable. wpawolf promotes the PMKID's `AkmType` from `Wpa1` to `Wpa2Psk` while keeping the EAPOL classification as `Wpa1`. Without this, `from_akm_and_attack(Wpa1, is_pmkid=true) -> None` would silently drop the PMKID at output. - **A-MSDU subframe iteration** (`src/ieee80211/amsdu.rs`). 802.11n aggregated MSDUs are dispatched dual-path: the outer body is always parsed as a single MSDU (catches glitched A-MSDU bits on what is actually a complete single-MPDU EAPOL frame), then the A-MSDU bit (QoS Control byte 0 bit 7) drives a subframe walk that surfaces EAPOL hidden in subframes 2..N. - **FCS detection and tail-strip**. 
`src/link/radiotap.rs::has_fcs` reads radiotap Flags bit 4 (`0x10` = `IEEE80211_RADIOTAP_F_FCS`) and `src/link/mod.rs::strip` returns `(payload, had_fcs)` so the trailing 4-byte FCS is chopped before IE walking, preventing tag/length mis-parse on captures whose radio appended an FCS. -- **MSDU fragment reassembly** (`src/store/fragments.rs`). Per-(SA, RA, SeqNum) buffer of non-final fragments; `take_completed` returns the concatenated MSDU body when the final fragment arrives. Most EAPOL fits in one MPDU but FT-PSK M2 with extended IEs occasionally fragments. Stats: `fragment_stats.{seen, reassembled, dropped_disorder, dropped_overflow}`. +- **MSDU fragment reassembly** (`src/store/fragments.rs`). Per-(SA, RA, SeqNum) buffer of fragments accepted in any arrival order; the concatenated MSDU body is returned once the final fragment plus all predecessors are present. Most EAPOL fits in one MPDU but FT-PSK M2 with extended IEs occasionally fragments. Stats: `fragment_stats.{fragments_seen, fragments_reassembled, fragments_incomplete, fragments_dropped_safety_cap}`. - **Cross-file pairing test** (`tests/integration/cross_file_pairing.rs`). Splits a real-world capture at its midpoint into two files, runs wpawolf on the directory, asserts the output set matches the single-file baseline. Regression oracle for the shared-`MessageStore` invariant that lets handshakes survive `tcpdump`-rolled capture boundaries. Documentation split into a six-doc layout: @@ -317,6 +329,6 @@ Documentation split into a six-doc layout: - [`CHANGELOG.md`](CHANGELOG.md) (this file) -- per-release summary of what shipped, what changed, and what was removed. - [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md) -- every WPA-PSK hash format current hashcat understands today (modes 22000 + 37100), the four legacy prefixes, the `keyver` byte trick, message-pair byte (EAPOL + PMKID), per-row mapping of the 11 wpawolf types onto current hashcat, support matrix, limitations. 
- [`HASHCAT-NEW-FORMATS.md`](HASHCAT-NEW-FORMATS.md) -- why the 11-type classification exists and how each row works: encoding rules, per-type cracker math (PBKDF2 -> PMKID / PTK / MIC), hash-line layout (16 B vs 24 B MIC, FT extras), N#E# vs M#E# notation, complete message-pair byte specification. -- [`HASHCAT-PROPOSED-CHANGES.md`](HASHCAT-PROPOSED-CHANGES.md) -- sketch of a unified `mode 22001` consuming all 11 types: parsed-line struct widening, loader dispatch, per-kernel work items, migration path. +- [`HASHCAT-PROPOSED-CHANGES.md`](HASHCAT-PROPOSED-CHANGES.md) -- sketch of two new modes (22002 passphrase-side, 22003 PMK-side) consuming all 11 types: parsed-line struct widening, loader dispatch, per-kernel work items, migration path. Known follow-ups: dedicated hashcat 24 B MIC kernel for the SHA-384 family (types 8-11) -- the lines are emitted today on the relevant per-AKM sinks but cannot be cracked without an upstream kernel update. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3eef570..c88ca42 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,7 +11,7 @@ Thanks for wanting to contribute. `wpawolf` is a narrow-scope tool (WPA/WPA2/WPA ``` wpawolf/ -├── src/ Rust source (input/, link/, ieee80211/, store/, pair/, output/) +├── src/ Rust source (input/, link/, ieee80211/, extract/, store/, pair/, output/) ├── tests/ Integration tests + binary fixtures (incl. tests/fixtures/generated/ corpus) ├── tools/fixturegen/ Workspace crate that emits the test-capture corpus (separate Cargo crate) ├── .github/ CI / Security / Release workflows + issue + PR templates @@ -26,7 +26,7 @@ wpawolf/ └── Makefile Developer workflow + cross-platform release builds ``` -The project runs strict clippy (pedantic + nursery + cargo) with zero warnings, and the test suite covers 904 cases across lib + binary + integration. 
An external multi-GB regression dataset (out-of-tree) is exercised opportunistically before each release on real-world traffic that is too noisy or legally encumbered to commit. +The project runs strict clippy (pedantic + nursery + cargo) with zero warnings, and the test suite covers 920 cases across lib + binary + integration. An external multi-GB regression dataset (out-of-tree) is exercised opportunistically before each release on real-world traffic that is too noisy or legally encumbered to commit. ## Before you open a PR @@ -35,7 +35,7 @@ make check-all make check-parity # only when touching pairing / output / extraction ``` -`make check-all` runs, in order: `fmt`, `clippy` (zero warnings), `cargo deny`, `cargo check`, `cargo test`, `cargo doc` with warnings-as-errors, ASCII hygiene, LF hygiene, and unused-dependency detection. A green `check-all` is required for review. +`make check-all` runs, in order: `fmt`, `clippy` (zero warnings), `cargo deny`, the `audit-citations` hcxpcapngtool line-citation check, the `audit-stats` banner-contract check ([`STATS.md`](STATS.md) vs `src/stats.rs`, both directions), `cargo check`, `cargo test`, `cargo doc` with warnings-as-errors, ASCII hygiene, LF hygiene, and unused-dependency detection. A green `check-all` is required for review. `make check-parity` re-runs the superset test against `hcxpcapngtool` with `CI=true` set, which converts a missing or stale oracle from a soft skip into a hard failure. Run this whenever you change anything in `src/pair/`, `src/output/`, `src/store/`, or `src/extract/`. @@ -68,7 +68,7 @@ If hcxpcapngtool is missing or older than 7.0.1, the test prints a clearly-tagge ## Dependency additions -Require a paragraph-long justification in the PR body addressing the rejected-crate policy in [`ARCHITECTURE.md §4`](ARCHITECTURE.md). Bar is high: target runtime dep count is 4 (`flate2`, `clap`, `rayon`, `sysinfo`). 
Dev-dependencies are less restrictive but still subject to `cargo deny` licence allow-list. +Require a paragraph-long justification in the PR body addressing the rejected-crate policy in [`ARCHITECTURE.md §4`](ARCHITECTURE.md). Bar is high: target runtime dep count is 5 (`flate2`, `crc32fast`, `clap`, `rayon`, `sysinfo`). Dev-dependencies are less restrictive but still subject to the `cargo deny` licence allow-list. ## Adding a capture fixture @@ -76,7 +76,7 @@ Require a paragraph-long justification in the PR body addressing the rejected-cr - Over 1 MiB, keep out-of-tree and reference it from benchmarks only. - **Redact** real ESSIDs and client MAC addresses unless the capture comes from a lab network you control. wireshark's *Edit → Preferences → Name Resolution* + `editcap` can help. -The companion crate at [`tools/fixturegen/`](tools/fixturegen/) emits a deterministic 75-fixture pcap/pcapng corpus covering every (hash category × PMKID site × N#E# combo × link-layer × edge case) tuple, with cryptographically valid PMK / PMKID / MIC values — 117 of 123 lines crack end-to-end through hashcat 7.1.2 with PSK `hashcat!` (the 6 that don't are documented hashcat kernel limitations, see [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md) §8.1). +The companion crate at [`tools/fixturegen/`](tools/fixturegen/) emits a deterministic 75-fixture pcap/pcapng corpus covering every (hash category × PMKID site × N#E# combo × link-layer × edge case) tuple, with cryptographically valid PMK / PMKID / MIC values — every legacy-sink line cracks end-to-end through hashcat 7.1.2 with PSK `hashcat!` except the documented kernel limitations (the PSK-SHA-256 PMKID lines and the two APLESS FT combos; see [`HASHCAT-CURRENT-FORMATS.md`](HASHCAT-CURRENT-FORMATS.md) §8).
## Reporting hashcat / hcxtools gaps diff --git a/README.md b/README.md index 2c28e0e..b6acd2a 100644 --- a/README.md +++ b/README.md @@ -22,15 +22,16 @@ ## Features -- **Pure safe Rust** - `#![forbid(unsafe_code)]`, four runtime crates (`flate2` + `clap` + `rayon` + `sysinfo`) +- **Pure safe Rust** - `#![forbid(unsafe_code)]`, five runtime crates (`flate2` + `crc32fast` + `clap` + `rayon` + `sysinfo`) - **Parallel pairing** - rayon work-stealing across CPU cores with streaming per-group fan-out - **Wide defaults** - emits every valid handshake; you filter at the end - **Cross-file pairing** - M1 in file A pairs with M2 in file B - **20 PMKID extraction sites** - every spec-defined location wired and counted -- **Deep frame walking** - A-MSDU subframes, MSDU fragment reassembly, radiotap FCS strip +- **Deep frame walking** - A-MSDU subframes, out-of-order MSDU fragment reassembly, CRC-32 FCS validation with tiered recovery of corrupt link-layer headers - **Garbage-pattern rejection** - nonces / MICs / PMKIDs checked against five pattern classes +- **Disk-backed fallback** - heavy stores spill to disk at 80 % RAM so corpus-scale runs finish instead of OOMing - **Fast** - >=200 MB/s on NVMe; Phase 1 I/O-bound, Phase 4 CPU-parallel -- **904 tests**; `make check-all` zero-warning under strict clippy +- **918 tests**; `make check-all` zero-warning under strict clippy --- @@ -45,14 +46,18 @@ hashcat -m 37100 hashes.37100 wordlist.txt Sample output (stats banner, truncated): ``` -=== Phase 4 -- Emit ========================================== -EAPOL pairs generated (total).......................: 142 - N1E2 challenge (ANonce M1, EAPOL M2)..............: 24 - N3E2 authorized (ANonce M3, EAPOL M2).............: 24 ---22000-out (legacy mode 22000).....................: hashes.22000 - lines written.....................................: 142 -=== Phase 5 -- Report ======================================== -hashes emitted (total)..............................: 154 +=== Phase 4 -- 
Emit ================================================== +output filters active.......................................: none (WIDE mode) +EAPOL pairs generated (total, pre-dedup)....................: 105 +EAPOL pairs written (post-dedup)............................: 105 + N1E2 challenge (ANonce from M1, EAPOL from M2)............: 22 + N3E2 authorized (ANonce from M3, EAPOL from M2)...........: 18 +--22000-out (legacy mode 22000).............................: hashes.22000 + lines written.............................................: 108 +=== Phase 5 -- Report ================================================ +hashes emitted (total)......................................: 147 +wallclock total (s).........................................: 0.4 +disk-backed fallback engaged................................: no ``` At least one output flag is required; `wpawolf` exits without doing any work if no output is configured. @@ -125,7 +130,7 @@ Both tools cover the same AKM scope (PSK and FT-PSK). The difference is default
Full flag reference (click to expand) -`wpawolf` accepts pcap, pcapng, and gzip captures (ten libpcap magic byte sequences including IXIA lcap variants). Positional arguments can be files or directories (walked recursively, magic-byte inclusion, extensions never consulted). Cross-file pairing is on by default; `--per-file` disables it. +`wpawolf` accepts pcap, pcapng, and gzip captures (ten libpcap magic byte sequences including IXIA lcap variants) over raw 802.11, radiotap, PPI, Prism, AVS, and Linux cooked (SLL / SLL2) link layers. Positional arguments can be files or directories (walked recursively, magic-byte inclusion, extensions never consulted). Cross-file pairing is always on: all EAPOL messages are collected across every input file before pairing runs. ### Hash output files @@ -167,19 +172,18 @@ The per-AKM sinks (`-o` and the six per-family flags) use an eleven-prefix forma | `--nc-tolerance N` | 8 | cluster span for `--nc-dedup` | | `--essid-collapse-min N` | 3 | SSID-variant collapse: min distinct SSIDs to trigger | | `--essid-collapse-ratio N` | 10 | SSID-variant collapse: top/second ratio threshold | -| `--strict` | off | bundle: `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --per-file --nc-dedup` | +| `--strict` | off | bundle: `--eapoltimeout=5 --rc-drift=8 --dedup-hash-combos --nc-dedup` | ### Runtime options | Flag | Default | Meaning | |---|---|---| | `--threads N` | CPU count | Phase 4 worker count; `--threads=1` for reproducible output | -| `--per-file` | off | pair + emit + clear per input file; bounds RSS | | `--quiet` | off | suppress progress lines | | `--mem-stats` | off | per-store footprint table after closing banner | | `--debug` | off | timestamped phase/file/group diagnostic lines | -Progress lines print to stdout every 5 s or every 2M packets; `--quiet` silences them. RSS is reported cross-platform via `sysinfo`. Every run prints a Phase 1-5 stats summary unconditionally. 
Garbage-pattern nonces / MICs / PMKIDs are rejected at extract time; missing SSIDs drop at emit time. See [`ARCHITECTURE.md`](ARCHITECTURE.md) §4 and §9 for the full rejection and stats catalogue. +Progress lines print to stdout every 5 s or every 2M packets; `--quiet` silences them. RSS is reported cross-platform via `sysinfo`. Every run prints a Phase 1-5 stats summary unconditionally. Garbage-pattern nonces / MICs / PMKIDs are rejected at extract time; missing SSIDs drop at emit time. Every line of that summary -- its backing field, spec source, why it exists, and whether it drops packets -- is catalogued in [`STATS.md`](STATS.md).
@@ -195,7 +199,8 @@ See [`CONTRIBUTING.md`](CONTRIBUTING.md) for the development workflow, parity or | Document | Covers | |---|---| -| [ARCHITECTURE.md](ARCHITECTURE.md) | 5-phase pipeline, critical invariants, EAPOL pairing, PMKID extraction, stats catalogue, FR-* contracts | +| [ARCHITECTURE.md](ARCHITECTURE.md) | 5-phase pipeline, critical invariants, EAPOL pairing, PMKID extraction, FR-* contracts | +| [STATS.md](STATS.md) | the stats-banner contract: every line's field, spec source, reason, and drop behaviour | | [CHANGELOG.md](CHANGELOG.md) | per-release summary of what shipped | | [HASHCAT-CURRENT-FORMATS.md](HASHCAT-CURRENT-FORMATS.md) | modes 22000 + 37100 as they exist in hashcat today | | [HASHCAT-NEW-FORMATS.md](HASHCAT-NEW-FORMATS.md) | the 11 hash types: per-AKM cracker math, line layout, message-pair byte | From bd6e7beb6c543336f81e4afc574318e6b5fbdc51 Mon Sep 17 00:00:00 2001 From: StrongWind <5987034+StrongWind1@users.noreply.github.com> Date: Wed, 10 Jun 2026 19:00:22 -0400 Subject: [PATCH 14/14] feat: report per-hash-type found vs written in the stats banner The 11-type block counted only hashes written to a configured sink, so a type present in the capture but lacking a matching sink (the SHA-384 family with only --22000-out, FT without --37100-out) was invisible and the distinct-types count ran low. Each row now shows found / written: found is the sink-independent inventory of what the capture contains, written is what reached a file. Adds a found-but-not-written alert row and counts the inventory for distinct types observed. 
--- CHANGELOG.md | 3 +- STATS.md | 9 ++-- src/main.rs | 4 +- src/output/mod.rs | 49 +++++++++++++++++++-- src/stats.rs | 62 +++++++++++++++++++-------- tests/integration/generated_corpus.rs | 33 ++++++++++++++ 6 files changed, 131 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f542cd..915155d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,8 +25,9 @@ Rayon-based parallel pairing with streaming per-group fan-out, automatic disk-ba - **Stats banner redesign.** The closing banner is now a formal contract ([`STATS.md`](STATS.md)) rendered by `Stats::summary_string` at a W=60 value column with a test-enforced 58-char label cap, four-class issue suffixes (`dropped` / `recovered` / `diagnostic` / `informational`), and no embedded `": "` inside labels so `awk -F': '` parsing stays unambiguous. New run-context and cost rows: `output filters active` (a WIDE run and a `--strict` run are now distinguishable from the banner alone), per-phase wallclock + total, `throughput (MiB/s)`, `peak RSS (MiB)`, `disk-backed fallback engaged`, and a one-line `hint (no hashes)` row naming the largest drop counter on zero-hash runs. Accounting fixes: `EAPOL pairs generated` is now labelled pre-dedup with a new post-dedup `EAPOL pairs written` row whose combo children actually sum to it (the old generated total over-counted by the PMKID dedup drops -- the dedup counter was shared); `dedup dropped (total)` gains `EAPOL pair duplicates` / `PMKID duplicates` children; `PMKIDs written (post-dedup)` and the `no FCS present` row complete the FCS and PMKID accounting. Sink rows render configured sinks only (plus `hash/auxiliary sinks not configured: N` one-liners), and auxiliary sinks report `entries written` for parity with the hash sinks' line counts. 
New counters: `mic_failure_deauths` (Deauth reason 14 per Table 9-90), `ft_status_r0kh_unreachable` / `ft_status_invalid_pmkid` / `ft_status_invalid_mde` / `ft_status_invalid_fte` (Auth status 52-55 per Table 9-92 -- each one explains a missing FT-PSK handshake), `packets_zeroed_timestamp`, `wep_encrypted_data` (ExtIV=0 split from the WPA family per §12.5.2.2), `probe_resp_ssid_unset` / `probe_resp_ssid_zeroed`, `essid_changes_max`, `identities_extracted` / `usernames_extracted`, `bytes_ingested`, and `eapol_kdv0` (KDV=0 relabelled "AKM-defined; SHA-384 families" per Table 12-11 instead of being lumped into the reserved bucket). Cosmetic fixes: SLL/SLL2 now render as `DLT_LINUX_SLL` / `DLT_LINUX_SLL2` in the file metadata instead of `DLT_UNKNOWN`; sub-millisecond session gaps print in microseconds instead of `0` ms; the M4/non-M4 rejection splits no longer print bare-zero sides; the memory-pressure notices moved from stderr to stdout per the stderr-silent convention. A new `make audit-stats` gate (wired into `check-all`) diffs every `Stats` / `FragmentStats` field against the banner contract in both directions, killing the stats-doc drift class permanently. - **Stats consistency pass + [`STATS.md`](STATS.md) contract.** Audited every parse / decode / extract / pair / emit site for information gleaned but not counted, and closed every silent disposition so an operator can account for every packet and see every drop. 
New counters: `packets_unknown_linktype` and `truncated_after_header` (the two packet-level drops the main loop used to `continue` past silently), `band_other` (radiotap channel outside the three known bands), `mgmt_reserved_subtype` (so the management subtype rows reconcile to `mgmt_frames`), `mesh_control_malformed` (Mesh Control header with a reserved Address-Extension Mode -- previously a silent `None`), `pairs_time_filtered` / `pairs_rc_filtered` (pairs removed by the opt-in `--eapoltimeout` / `--rc-drift` filters, previously invisible), and `emit_dropped_unclassified_akm` / `emit_dropped_ft_no_context` (crack material dropped at emit because the AKM mapped to no 11-type or an FT line lacked its R0KH context -- both were bare `continue`s in the fan-out). The `--wordlist-scan` sink now appears in the Phase 4 auxiliary-sink list with its entry count. The full per-line catalogue -- label, backing field, spec source, why-we-care, and drop behaviour for every row, plus the four packet/pair/hash reconciliation identities -- moved out of `ARCHITECTURE.md §9` into the new tracked [`STATS.md`](STATS.md); `§9` is now a one-paragraph pointer, and `make audit-stats` checks the code against `STATS.md`. - **MLD canonicalization is now additive (fixes a single-link-on-MLO miss).** When an AP advertises an 802.11be Multi-Link Element, wpawolf used to *rewrite* every handshake / PMKID / SSID key from the link MAC onto the MLD MAC. That silently broke single-link associations to one BSSID of an MLD: their PTK/MIC is derived under the **link** MAC, so the rewritten MLD-keyed line is uncrackable. Now `MessageStore` / `PmkidStore` / `EssidMap` canonicalization *adds* an MLD-keyed copy while *keeping* the original link-keyed form, so both reach output -- the multi-link case cracks under the MLD MAC, the single-link case under the link MAC, and the crackable one is always present (the same "emit every candidate" rule as the six N#E# combos). 
Surfaced by a corpus diff against `hcxpcapngtool`: it was the one handshake in 70.8M packets that hcx extracted and wpawolf missed; with the fix, wpawolf is a complete superset (0 misses). The additive `EssidMap` path also repairs a latent side effect of the old `mem::take` rebuild, which reset every per-SSID observation count to 1 and thereby defeated the `--essid-collapse` filter (it cannot find a dominant SSID when all counts look like count-1 bit-flips); preserving the counts restores the collapse, so RF-rotted SSID variants are dropped as designed while each handshake keeps its crackable dominant-SSID line. Disk-mode canonicalization is additive too. Banner labels updated to "(AP,STA) groups / SSID entries also keyed under MLD (link kept)". Output stays duplicate-free (FR-CORRECT-2: `sort | uniq -d` empty across the corpus). +- **Per-hash-type breakdown now reports found vs written.** The 11-type block previously counted only hashes written to a configured sink, so a type with no matching sink -- the SHA-384 family (8-11) with only `--22000-out`, or FT (6-7) without `--37100-out` -- was invisible even when present in the capture, and "distinct hash types observed" undercounted. Each row is now `found / written`: `found` (new `hash_type_found`) is the sink-independent inventory of what the capture contains, `written` (`hash_type_emitted`) is what reached a file. "distinct hash types observed" now counts the inventory, and a new "hash types found but not written (add -o to capture)" row alerts when crackable material had no sink. Backed by a sink-independent global dedup so the inventory is unique per type (pre-dedup in disk mode, matching `written`); with `-o` configured, `found == written` for every type. - **Build / toolchain:** `make release` aliases the native optimised build; the CI MSRV check is pinned at Rust 1.95. -- 922 tests; `make check-all` passes clean. +- 923 tests; `make check-all` passes clean. 
### v0.3.10 -- 2026-05-16 diff --git a/STATS.md b/STATS.md index d9b69fc..1999bd7 100644 --- a/STATS.md +++ b/STATS.md @@ -32,7 +32,7 @@ These are the sums an operator can check against the banner. They hold by constr 3. **Pair accounting (Phase 4).** `eapol_pairs_generated = eapol_pairs_useful + dedup_dropped_pairs`. The opt-in filters reduce the candidate set *before* generation, so `pairs_time_filtered` / `pairs_rc_filtered` are reported as their own lines, not folded into the gap. The per-combo `pairs_written_n*` children sum to `eapol_pairs_useful` (the written total), not to the generated total. -4. **Hash accounting (Phase 5).** `hashes emitted (total)` is the sum over the 11-type table (`hash_type_emitted`). Its `EAPOL hash lines` / `PMKID hash lines` children are the odd-code / even-code halves of that same table, so they always sum to the total. PMKID material that did not reach a sink is accounted by `dedup_dropped_pmkids`, `emit_dropped_unclassified_akm`, `emit_dropped_ft_no_context`, and the Phase-3 garbage/`essid_unresolved` drops. +4. **Hash accounting (Phase 5).** `hashes emitted (total)` is the sum of *written* hashes over the 11-type table (`hash_type_emitted`). Its `EAPOL hash lines` / `PMKID hash lines` children are the odd-code / even-code halves of that same table, so they always sum to the total. `hash_type_found` is the parallel *found* inventory, counted independently of which sinks were configured (so `found >= written` per type), and it drives `distinct hash types observed` and the `found but not written` row. PMKID material that did not reach a sink is accounted by `dedup_dropped_pmkids`, `emit_dropped_unclassified_akm`, `emit_dropped_ft_no_context`, and the Phase-3 garbage/`essid_unresolved` drops. ## Formatting contract @@ -199,7 +199,7 @@ Pairing, classification, dedup, and the per-sink output rows. 
Source: ARCHITECTU | Line | Field(s) | Source | Why we care | Disposition | |---|---|---|---|---| | output filters active | `filters_active` | FR-CLI-3 | Echoes the resolved filter state so a WIDE run and a `--strict` run are distinguishable from the banner alone. | informational | -| per-hash-type lines emitted (11 rows) | `hash_type_emitted` | §2 | One row per 11-type code -- exactly what hashcat will see, type by type. | skeleton | +| per-hash-type found / written (11 rows) | `hash_type_found`, `hash_type_emitted` | §2 | One row per 11-type code. **found** is the sink-independent inventory of what the capture contains; **written** is what reached a configured output file. They differ when a type has no configured accepting sink (e.g. the SHA-384 family with only `--22000-out` shows `14 / 0`). | skeleton | | EAPOL pairs generated (total, pre-dedup) | `eapol_pairs_generated` | §5 | Pairs the engine produced before global dedup (identity 3). | skeleton | | EAPOL pairs written (post-dedup) | `eapol_pairs_useful` | §5 | Pairs that survived dedup; the combo children sum to this. | skeleton | | per-combo written (N1E2 / N3E2 / N1E4 / N2E3 / N4E3 / N3E4) | `pairs_written_n1e2`, `pairs_written_n3e2`, `pairs_written_n1e4`, `pairs_written_n2e3`, `pairs_written_n4e3`, `pairs_written_n3e4` | §5 | Which N#E# combos produced output (AP-less N2E3/N4E3 included). | informational | @@ -225,8 +225,9 @@ Executive summary an operator reads in five seconds. | Line | Field(s) | Source | Why we care | Disposition | |---|---|---|---|---| -| hashes emitted (total) + EAPOL/PMKID split | `hash_type_emitted` (summed) | §2 | The headline yield, split by attack surface (identity 4). | skeleton | -| distinct hash types observed | `hash_type_emitted` (nonzero count) | §2 | How many of the 11 types this capture produced. | skeleton | +| hashes emitted (total) + EAPOL/PMKID split | `hash_type_emitted` (summed) | §2 | The headline yield written to files, split by attack surface (identity 4). 
| skeleton | +| distinct hash types observed | `hash_type_found` (nonzero count) | §2 | How many of the 11 types the capture contains -- the inventory, independent of which sinks were configured. | skeleton | +| hash types found but not written (add -o to capture) | `hash_type_found` vs `hash_type_emitted` | §2 | Types present in the capture that reached no output file; configure `-o` or the per-AKM sink to write them. | **dropped** (operator config) | | wallclock Phase 1-3 / Phase 4 / total | `wallclock_p13_ms`, `wallclock_p4_ms` | -- | Where the time went (streaming pass vs pairing+emit). | informational | | throughput (MiB/s) | `bytes_ingested`, `wallclock_p13_ms` | -- | Ingest rate against the FR-PERF-1 target. | informational | | peak RSS (MiB) | `peak_rss_mib` | -- | High-water memory (lower bound, sampled at the pressure-check cadence). | informational | diff --git a/src/main.rs b/src/main.rs index 09782b9..8800cc4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1002,8 +1002,10 @@ fn run(cli: &Cli) -> wpawolf::types::Result<()> { // Per-hash-type breakdown -- one bucket per row of the 11-type table in // `ARCHITECTURE.md §2`. The output pipeline classifies each emitted // line via `HashType::from_akm_and_attack`; copy the resulting tally into - // the global stats for `print_summary`. + // the global stats for `print_summary`. `hash_type_found` is the + // sink-independent inventory; `hash_type_emitted` is what reached a file. stats.hash_type_emitted = output_stats.hash_type_emitted; + stats.hash_type_found = output_stats.hash_type_found; // Auxiliary sink entry counts (filled by `finalize` from the writer returns). stats.entries_essid_list = output_stats.entries_essid as u64; diff --git a/src/output/mod.rs b/src/output/mod.rs index bfbd195..6dee808 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -193,6 +193,15 @@ pub struct OutputStats { /// `run_output` returns. 
pub hash_type_emitted: HashMap<HashType, u64>, + /// Unique crackable hashes *found* in the capture, keyed by `HashType`, + /// counted independently of which output sinks are configured. A hash whose + /// only candidate sinks are unconfigured (e.g. the SHA-384 family with just + /// `--22000-out`) is absent from `hash_type_emitted` but still counted here, + /// so the banner reports the full 11-type inventory of the capture's content. + /// Deduped via `found_dedup` in memory mode; pre-dedup (write-through) in + /// disk mode, matching `hash_type_emitted`. + pub hash_type_found: HashMap<HashType, u64>, + /// Hash lines emitted with an empty SSID because `essid_map` had no entry for /// the AP. Surfaces the residual hidden-SSID gap after Beacon, Probe Response, /// `AssocReq` / `ReassocReq`, directed Probe Request, and MLD canonicalization @@ -564,6 +573,11 @@ pub struct OutputContext { dedup: PerSinkDedup, sinks: HashSinks, disk_dedup: Option, + /// Sink-independent global dedup set used only to count `hash_type_found`: + /// every classified hash is fingerprinted here so the 11-type inventory + /// reflects what the capture contains, not just what reached a configured + /// sink. Unused in disk mode (found falls back to write-through counting). + found_dedup: dedup::DedupSet, /// APs whose hash lines we declined to emit because no ESSID was ever /// observed for them.
Such lines are not crackable (hashcat needs the /// ESSID to derive the PMK), so they go to `--log` only -- nothing @@ -599,6 +613,7 @@ impl OutputContext { dedup: PerSinkDedup::new(), sinks: HashSinks::open(paths), disk_dedup: None, + found_dedup: dedup::DedupSet::new(), unresolved_drops: HashMap::new(), timestamp_ranges: HashMap::new(), } @@ -708,6 +723,7 @@ impl OutputContext { let dedup = &mut self.dedup; let sinks = &mut self.sinks; let disk_dedup = &mut self.disk_dedup; + let found_dedup = &mut self.found_dedup; let unresolved_drops = &mut self.unresolved_drops; // --- Pipeline 1: PMKIDs (Invariant OUT-1 -- always before EAPOL pairs) --- @@ -751,6 +767,12 @@ impl OutputContext { } for essid in ssids { + // Inventory count (sink-independent): tally this type as found + // whether or not a sink is configured to accept it. Deduped in + // memory; counted write-through in disk mode (like `emitted`). + if disk_dedup.is_some() || found_dedup.check_pmkid(&entry, essid) { + *stats.hash_type_found.entry(ht).or_insert(0) += 1; + } let item = FanItem::Pmkid { entry: &entry, ft: ft_ctx, essid }; let written = fan_out(sinks, dedup, disk_dedup, stats, ht, item)?; if written { @@ -776,13 +798,21 @@ impl OutputContext { sinks: &'a mut HashSinks, dedup: &'a mut PerSinkDedup, disk_dedup: &'a mut Option, + found_dedup: &'a mut dedup::DedupSet, stats: &'a mut OutputStats, unresolved_drops: &'a mut HashMap, first_error: Option, } - let emit_state = - std::sync::Mutex::new(EmitState { sinks, dedup, disk_dedup, stats, unresolved_drops, first_error: None }); + let emit_state = std::sync::Mutex::new(EmitState { + sinks, + dedup, + disk_dedup, + found_dedup, + stats, + unresolved_drops, + first_error: None, + }); let total_pairs_processed = std::sync::atomic::AtomicUsize::new(0); let nc_stats = @@ -798,8 +828,15 @@ impl OutputContext { } if any_sink { - let EmitState { sinks: s, dedup: d, disk_dedup: dd, stats: st, unresolved_drops: ud, first_error } = - &mut *guard; + let EmitState { 
+ sinks: s, + dedup: d, + disk_dedup: dd, + found_dedup: fd, + stats: st, + unresolved_drops: ud, + first_error, + } = &mut *guard; for pair in &pairs { let Some(ht) = HashType::from_akm_and_attack(pair.akm, false) else { st.emit_dropped_unclassified_akm += 1; @@ -827,6 +864,10 @@ impl OutputContext { } for essid in ssids { + // Inventory count (sink-independent), as in Pipeline 1. + if dd.is_some() || fd.check_eapol(pair, essid) { + *st.hash_type_found.entry(ht).or_insert(0) += 1; + } let item = FanItem::Eapol { pair, ft: ft_ctx, essid }; match fan_out(s, d, dd, st, ht, item) { Ok(written) => { diff --git a/src/stats.rs b/src/stats.rs index 4d0adf5..4bd0818 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -842,6 +842,14 @@ pub struct Stats { // currently share a hashcat mode. /// Hash lines written, keyed by `HashType`. pub hash_type_emitted: HashMap<HashType, u64>, + /// Unique crackable hashes *found* in the capture, keyed by `HashType`, + /// independent of which output sinks were configured. Equals + /// `hash_type_emitted` for any type with a configured accepting sink; for a + /// type with no configured sink (e.g. the SHA-384 family with only + /// `--22000-out`) `hash_type_emitted` is 0 but this still counts what the + /// capture contained. Drives the per-type "found / written" rows and the + /// "distinct hash types observed" count. + pub hash_type_found: HashMap<HashType, u64>, // --- Extraction-side identity tallies --- /// Unique EAP identity strings extracted (RFC 3748 §5.1). Printed in Phase 3 @@ -1466,16 +1474,19 @@ impl Stats { } // Per-hash-type breakdown -- one row per `HashType` variant from the - // 11-type classification in ARCHITECTURE.md §2. Anchors every emitted hash - // line to a single (AKM, attack surface) so the operator can read off - // exactly what hashcat will see, type code by type code.
- if self.hash_type_emitted.values().any(|&n| n > 0) { - let _ = writeln!(out, "per-hash-type lines emitted (per ARCHITECTURE.md §2):"); + // 11-type classification in ARCHITECTURE.md §2. The "found" column is the + // sink-independent inventory (what the capture contains); the "written" + // column is what reached a configured output file. They differ when a type + // has no configured accepting sink -- e.g. the SHA-384 family with only + // `--22000-out` shows "14 / 0": found in the capture, not written. + if self.hash_type_found.values().any(|&n| n > 0) { + let _ = writeln!(out, "per-hash-type found / written (per ARCHITECTURE.md §2):"); for ht in HashType::all() { - let n = self.hash_type_emitted.get(&ht).copied().unwrap_or(0); - if n > 0 { + let found = self.hash_type_found.get(&ht).copied().unwrap_or(0); + if found > 0 { + let written = self.hash_type_emitted.get(&ht).copied().unwrap_or(0); let label = format!(" {:>2}. {}", ht.type_code(), ht.name()); - stat!(label, n); + stat!(label, format!("{found} / {written}")); } } } @@ -1595,24 +1606,32 @@ impl Stats { // ====================================================================== section!(5, "Report"); - // Total hashes = sum of per-`HashType` counts (counted once per logical hash - // regardless of how many sinks it fanned out to). The EAPOL/PMKID children - // come from the same table -- odd type codes are EAPOL attacks, even codes - // are PMKID attacks per the ARCHITECTURE.md §2 encoding rule -- so the two - // children always sum to the total. Distinct types observed = number of - // `HashType` rows whose counter is non-zero. + // "emitted" = written to a configured sink; the EAPOL/PMKID children split + // it by odd/even type code per the ARCHITECTURE.md §2 encoding rule, so they + // always sum to the total. "found" = the sink-independent inventory of what + // the capture contained, so it can exceed "emitted" when a type had no + // configured accepting sink. 
"distinct hash types observed" counts the + // inventory (found), not just what was written. let total_hashes: u64 = HashType::all().map(|ht| self.hash_type_emitted.get(&ht).copied().unwrap_or(0)).sum(); let eapol_lines: u64 = HashType::all() .filter(|ht| ht.type_code() % 2 == 1) .map(|ht| self.hash_type_emitted.get(&ht).copied().unwrap_or(0)) .sum(); let pmkid_lines: u64 = total_hashes - eapol_lines; - let active_types = - HashType::all().filter(|ht| self.hash_type_emitted.get(ht).copied().unwrap_or(0) > 0).count(); + let found_types = HashType::all().filter(|ht| self.hash_type_found.get(ht).copied().unwrap_or(0) > 0).count(); + // Types present in the capture but not written anywhere -- the operator + // can add `-o` (or the matching per-AKM sink) to capture them. + let found_not_written = HashType::all() + .filter(|ht| { + self.hash_type_found.get(ht).copied().unwrap_or(0) > 0 + && self.hash_type_emitted.get(ht).copied().unwrap_or(0) == 0 + }) + .count() as u64; stat!("hashes emitted (total)", total_hashes); nz!(" EAPOL hash lines", eapol_lines); nz!(" PMKID hash lines", pmkid_lines); - stat!("distinct hash types observed", active_types); + stat!("distinct hash types observed", found_types); + nz!("hash types found but not written (add -o to capture)", found_not_written); // Run cost. Wallclock is split at the Phase 3 / Phase 4 boundary (the // streaming pass vs the pairing + emit pass); throughput is file bytes @@ -2187,10 +2206,15 @@ mod tests { s.pmkid_osen = 1; s.pmkid_wpa2_psk = 1; s.pmkid_ft_psk = 1; - // Phase 4. + // Phase 4. Every type is found; one type is left written=0 so the + // per-type "found / written" rows and the "found but not written" row + // both render for the label-width check. 
s.filters_active = "eapoltimeout=5, rc-drift=8, dedup-hash-combos, nc-dedup (tolerance 8)".to_owned(); for ht in HashType::all() { - *s.hash_type_emitted.entry(ht).or_insert(0) += 1; + *s.hash_type_found.entry(ht).or_insert(0) += 1; + if ht != HashType::FtPskSha384Eapol { + *s.hash_type_emitted.entry(ht).or_insert(0) += 1; + } } s.eapol_pairs_generated = 8; s.eapol_pairs_useful = 6; diff --git a/tests/integration/generated_corpus.rs b/tests/integration/generated_corpus.rs index 999525c..5ddc67e 100644 --- a/tests/integration/generated_corpus.rs +++ b/tests/integration/generated_corpus.rs @@ -111,6 +111,39 @@ fn packet_accounting_holds_across_generated_corpus() { assert!(!banner.contains("multi-counted (BUG"), "frames were multi-counted:\n{banner}"); } +/// The per-hash-type breakdown reports what the capture CONTAINS, not just what +/// reached an output file. The SHA-384 family (types 8-11) has no legacy sink, so +/// a run with only `--22000-out` writes none of it -- but the banner must still +/// report those types as *found* (so an operator knows the capture holds crackable +/// material their flags did not write). Drives a SHA-384 fixture with only +/// `--22000-out` and asserts the type appears in the found/written block with +/// written = 0, that "distinct hash types observed" still counts it, and that the +/// "found but not written" alert fires. +#[test] +fn sha384_types_reported_as_found_without_a_matching_sink() { + let fixture = Path::new(CORPUS_ROOT).join("11_types/type08_psksha384_pmkid.pcap"); + if !fixture.exists() { + return; // corpus not generated + } + let bin = binary_path(); + let out22000 = std::env::temp_dir().join(format!("wpawolf-sha384-{}-{}.22000", std::process::id(), nanos_unique())); + let _ = fs::remove_file(&out22000); + // ONLY --22000-out -- no -o, no --psk-sha384-out -- so SHA-384 has no sink. 
+ let output = Command::new(&bin).arg("--22000-out").arg(&out22000).arg(&fixture).output().expect("spawn wpawolf"); + assert!(output.status.success(), "wpawolf failed on the SHA-384 fixture"); + let banner = String::from_utf8_lossy(&output.stdout).into_owned(); + + // The SHA-384 PMKID type is found in the capture even though no sink wrote it. + assert!(banner.contains("PSK-SHA384-PMKID"), "SHA-384 type missing from found block:\n{banner}"); + // The inventory counts it; the operator is told to add -o to capture it. + assert!(banner.contains("hash types found but not written"), "missing found-not-written alert:\n{banner}"); + assert!(banner.contains("distinct hash types observed"), "missing distinct-types row:\n{banner}"); + // Nothing was written to the only configured sink (lazy: file may not exist). + let written = fs::read_to_string(&out22000).map_or(0, |s| s.lines().count()); + assert_eq!(written, 0, "SHA-384 must not reach the --22000-out sink"); + let _ = fs::remove_file(&out22000); +} + #[test] fn manifest_is_present() { let manifest = Path::new(CORPUS_ROOT).join("ground_truth/manifest.toml");