diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 17443ce..38da336 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -97,6 +97,9 @@ jobs: - name: Test (vcan) if: steps.vcan.outputs.available == 'true' run: cargo llvm-cov --no-report nextest --all-features --run-ignored ignored-only + # Run clippy twice - once with the 1.89 MSRV, and once with the latest stable toolchain + - name: Clippy + run: cargo clippy --no-deps --all-targets --all-features - name: Coverage report run: | cargo llvm-cov report --cobertura --output-path coverage.xml @@ -106,9 +109,6 @@ jobs: PERCENT="$(echo "($RATE * 100)/1" | bc)" echo "PERCENT=$PERCENT" echo "COVERAGE_PERCENT=$PERCENT" >> $GITHUB_ENV - # Run clippy twice - once with the 1.89 MSRV, and once with the latest stable toolchain - - name: Clippy - run: cargo clippy --no-deps --all-targets --all-features - name: Update coverage badge uses: schneegans/dynamic-badges-action@v1.7.0 if: github.ref_name == github.event.repository.default_branch @@ -125,6 +125,20 @@ jobs: valColorRange: ${{ env.COVERAGE_PERCENT }} minColorRange: 40 maxColorRange: 65 + - name: Setup nightly toolchain (ASAN) + uses: dtolnay/rust-toolchain@master + with: + toolchain: nightly + components: rust-src + - name: Test (ASAN) + env: + RUSTFLAGS: -D warnings -Zsanitizer=address + run: cargo +nightly nextest run -Zbuild-std --target x86_64-unknown-linux-gnu --all-features --no-tests=warn + - name: Test (ASAN, vcan) + if: steps.vcan.outputs.available == 'true' + env: + RUSTFLAGS: -D warnings -Zsanitizer=address + run: cargo +nightly nextest run -Zbuild-std --target x86_64-unknown-linux-gnu --all-features --run-ignored ignored-only # Canary job: verifies vcan is available on the runner. 
Shows yellow when the # linux-modules-extra package drifts from the runner kernel version, which means the socketcan diff --git a/Cargo.toml b/Cargo.toml index bb161d9..33f546b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,9 +11,10 @@ version = "0.1.0-rc0" edition = "2024" license = "MIT" rust-version = "1.89" -description = "Opinionated CAN utils written in Rust" +description = "Opinionated CAN utilities written in Rust" [workspace.dependencies] +assert_cmd = { version = "2.2.0", features = ["color-auto"] } ctor = "0.6" eyre = "0.6" gungraun = "0.17" @@ -21,5 +22,6 @@ io-uring = "0.7" libc = "0.2" neli = "0.7" tabled = "0.18" +tempfile = "3.27.0" tracing = "0.1" tracing-subscriber = "0.3" diff --git a/README.md b/README.md index dec4706..4215200 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ![release workflow](https://github.com/Notgnoshi/candemonium/actions/workflows/release.yml/badge.svg?event=push) ![code coverage](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/Notgnoshi/55f3f6cae2abdc5d011d907624dfb883/raw/can-utils-rs-coverage.json) -Opinionated CAN utils written in Rust. +Opinionated CAN utilities written in Rust. ## Purpose @@ -16,3 +16,9 @@ constraints. A modern-ish Linux with io_uring and socketcan available. A ~4 core ~1GHz arm64 CPU with 1GB memory and 4+ J1939 CAN networks. 
+ +## Documentation + +* See [quickstart.md](/docs/developer/quickstart.md) for a developer quickstart +* See `docs/design/` for design documents +* See `docs/user/` for user documentation diff --git a/candumpr/Cargo.toml b/candumpr/Cargo.toml index 78554e3..08abbd4 100644 --- a/candumpr/Cargo.toml +++ b/candumpr/Cargo.toml @@ -13,12 +13,12 @@ ci = [] eyre.workspace = true io-uring.workspace = true libc.workspace = true +tracing.workspace = true [dev-dependencies] ctor.workspace = true gungraun.workspace = true tabled.workspace = true -tracing.workspace = true tracing-subscriber.workspace = true vcan-fixture = { path = "../vcan-fixture" } diff --git a/candumpr/benches/common/mod.rs b/candumpr/benches/common/mod.rs index b295715..fa4ed7e 100644 --- a/candumpr/benches/common/mod.rs +++ b/candumpr/benches/common/mod.rs @@ -1,6 +1,6 @@ use std::os::unix::io::{AsFd, BorrowedFd, OwnedFd}; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::time::{Duration, Instant}; use candumpr::can::{self, CanFrame}; @@ -51,6 +51,38 @@ pub const BACKENDS: &[BackendDef] = &[ }, ]; +// --- Sequence checker --- + +fn frame_seq(frame: &CanFrame) -> u32 { + u32::from_le_bytes([frame.data[0], frame.data[1], frame.data[2], frame.data[3]]) +} + +struct SeqCheck { + expected: Vec, +} + +impl SeqCheck { + fn new(n: usize) -> Self { + Self { + expected: (0..n).map(|_| AtomicU32::new(0)).collect(), + } + } + + fn check(&self, idx: usize, frame: &CanFrame) { + let actual = frame_seq(frame); + let expected = self.expected[idx].load(Ordering::Relaxed); + if actual != expected { + tracing::warn!( + iface = idx, + received = actual, + expected = expected, + "out-of-sequence frame" + ); + } + self.expected[idx].store(actual.wrapping_add(1), Ordering::Relaxed); + } +} + // --- Backend run functions --- // // Single-threaded backends: wrap the backend's run() with getrusage_thread() before/after. 
@@ -58,12 +90,14 @@ pub const BACKENDS: &[BackendDef] = &[ // aggregate the deltas. fn run_dedicated(sockets: Vec, stop: Arc, count: &AtomicU64) -> (u64, Rusage) { + let seq = SeqCheck::new(sockets.len()); let backend = DedicatedRecv::new(sockets); let rusage = std::sync::Mutex::new(Rusage::default()); let total = backend .run_instrumented( stop, - &|_idx, _frame, _meta| { + &|idx, frame, _meta| { + seq.check(idx, frame); count.fetch_add(1, Ordering::Relaxed); }, &|_idx, inner| { @@ -78,10 +112,12 @@ fn run_dedicated(sockets: Vec, stop: Arc, count: &AtomicU64 } fn run_epoll(sockets: Vec, stop: Arc, count: &AtomicU64) -> (u64, Rusage) { + let seq = SeqCheck::new(sockets.len()); let mut backend = EpollRecv::new(sockets).unwrap(); let before = getrusage_thread(); let total = backend - .run(stop, &mut |_idx, _frame, _meta| { + .run(stop, &mut |idx, frame, _meta| { + seq.check(idx, frame); count.fetch_add(1, Ordering::Relaxed); }) .unwrap(); @@ -90,10 +126,12 @@ fn run_epoll(sockets: Vec, stop: Arc, count: &AtomicU64) -> } fn run_recvmmsg(sockets: Vec, stop: Arc, count: &AtomicU64) -> (u64, Rusage) { + let seq = SeqCheck::new(sockets.len()); let mut backend = RecvmmsgRecv::new(sockets).unwrap(); let before = getrusage_thread(); let total = backend - .run(stop, &mut |_idx, _frame, _meta| { + .run(stop, &mut |idx, frame, _meta| { + seq.check(idx, frame); count.fetch_add(1, Ordering::Relaxed); }) .unwrap(); @@ -102,10 +140,12 @@ fn run_recvmmsg(sockets: Vec, stop: Arc, count: &AtomicU64) } fn run_uring(sockets: Vec, stop: Arc, count: &AtomicU64) -> (u64, Rusage) { + let seq = SeqCheck::new(sockets.len()); let mut backend = UringRecv::new(sockets).unwrap(); let before = getrusage_thread(); let total = backend - .run(stop, &mut |_idx, _frame, _meta| { + .run(stop, &mut |idx, frame, _meta| { + seq.check(idx, frame); count.fetch_add(1, Ordering::Relaxed); }) .unwrap(); @@ -118,10 +158,12 @@ fn run_uring_multi( stop: Arc, count: &AtomicU64, ) -> (u64, Rusage) { + let seq = 
SeqCheck::new(sockets.len()); let mut backend = UringMultiRecv::new(sockets).unwrap(); let before = getrusage_thread(); let total = backend - .run(stop, &mut |_idx, _frame, _meta| { + .run(stop, &mut |idx, frame, _meta| { + seq.check(idx, frame); count.fetch_add(1, Ordering::Relaxed); }) .unwrap(); @@ -155,18 +197,23 @@ fn sender_loop( } frame_idx += 1; } + // Let the receiver drain in-flight frames before signaling stop. Several receivers use 100ms + // as a timeout to wake themselves up. This isn't a great design, but it's possible to drop + // frames, so I can't just say "run until all frames have been received". + std::thread::sleep(Duration::from_millis(110)); stop.store(true, Ordering::Relaxed); } fn make_frame(iface_idx: usize, frame_idx: u32) -> CanFrame { + let seq = frame_idx.to_le_bytes(); CanFrame::new( ((iface_idx as u32) << 8) | (frame_idx & 0xFF) | libc::CAN_EFF_FLAG, &[ + seq[0], + seq[1], + seq[2], + seq[3], iface_idx as u8, - frame_idx as u8, - 0xDE, - 0xAD, - 0xBE, 0xEF, 0xCA, 0xFE, diff --git a/candumpr/examples/dump.rs b/candumpr/examples/dump.rs new file mode 100644 index 0000000..382a63e --- /dev/null +++ b/candumpr/examples/dump.rs @@ -0,0 +1,72 @@ +//! Listen on CAN interfaces using the io_uring multishot backend and print received frames. +//! +//! Usage: uring_multi_dump [iface...] 
+ +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use candumpr::can::{self, CanFrame}; +use candumpr::recv::uring_multi::UringMultiRecv; + +fn main() -> std::io::Result<()> { + tracing_subscriber::fmt() + .with_writer(std::io::stderr) + .with_max_level(tracing::Level::DEBUG) + .init(); + + let ifaces: Vec = std::env::args().skip(1).collect(); + if ifaces.is_empty() { + eprintln!("usage: uring_multi_dump [iface...]"); + std::process::exit(1); + } + + let sockets: Vec<_> = ifaces + .iter() + .map(|name| can::open_can_raw(name)) + .collect::>()?; + + let mut backend = UringMultiRecv::new(sockets)?; + + let stop = Arc::new(AtomicBool::new(false)); + let stop2 = stop.clone(); + ctrlc(stop2); + + let total = backend.run(stop, &mut |idx, frame, _meta| { + print_frame(idx, frame); + })?; + + eprintln!("{total} frames received"); + Ok(()) +} + +fn print_frame(idx: usize, frame: &CanFrame) { + let id = frame.can_id & !libc::CAN_EFF_FLAG & !libc::CAN_RTR_FLAG & !libc::CAN_ERR_FLAG; + + print!("{idx} {id:08X} [{}]", frame.len); + for i in 0..frame.len as usize { + print!(" {:02X}", frame.data[i]); + } + println!(); +} + +/// Install a Ctrl-C handler that sets the stop flag. +fn ctrlc(stop: Arc) { + unsafe { + libc::signal( + libc::SIGINT, + signal_handler as *const () as libc::sighandler_t, + ); + } + // Leak the Arc into a raw pointer so the signal handler can access it. + STOP_FLAG.store(Arc::into_raw(stop) as *mut _, Ordering::Release); +} + +static STOP_FLAG: std::sync::atomic::AtomicPtr = + std::sync::atomic::AtomicPtr::new(std::ptr::null_mut()); + +extern "C" fn signal_handler(_sig: libc::c_int) { + let ptr = STOP_FLAG.load(Ordering::Acquire); + if !ptr.is_null() { + unsafe { &*ptr }.store(true, Ordering::Relaxed); + } +} diff --git a/candumpr/src/recv/uring_multi.rs b/candumpr/src/recv/uring_multi.rs index 9c2bafe..e1aaf27 100644 --- a/candumpr/src/recv/uring_multi.rs +++ b/candumpr/src/recv/uring_multi.rs @@ -5,10 +5,8 @@ //! 
Performance features: //! * SINGLE_ISSUER: skip internal synchronization (single-threaded use). //! * COOP_TASKRUN: prevent kernel from delivering task_work at arbitrary syscall boundaries. -//! * DEFER_TASKRUN: defer all completion processing to explicit submit_with_args calls. //! * Registered file descriptors: avoid per-operation fd lookup in the kernel. //! * Batched wakeups: submit_with_args(BATCH_SIZE) reduces wakeup frequency. -//! * Enlarged CQ ring: headroom for burst-induced multishot completions. //! //! Ancillary data: //! * Hardware timestamps (SCM_TIMESTAMPING) with software fallback. @@ -25,8 +23,10 @@ use io_uring::{IoUring, cqueue, opcode, types}; use crate::can::{self, CanFrame, FRAME_SIZE}; use crate::recv::{FrameMeta, Timestamp}; -/// Number of provided buffers in the ring. +/// Number of provided buffers (and CQ entries) in the ring. The CQ is sized to match so the +/// kernel can post one completion per buffer without overflow. Must be a power of two. const FRAMEBUF_COUNT: u16 = 256; +const _: () = assert!(FRAMEBUF_COUNT.is_power_of_two()); /// Buffer group ID for the provided buffer ring. io_uring supports multiple buffer rings /// identified by group ID; we only use one. @@ -37,10 +37,6 @@ const BGID: u16 = 0; /// when traffic is sparse. const BATCH_SIZE: usize = 4; -/// CQ ring size. With multishot recv, a single SQE can generate many CQEs in a burst. A larger CQ -/// ring prevents overflow (which terminates the multishot and forces resubmission). -const CQ_SIZE: u32 = 64; - /// Size of the `io_uring_recvmsg_out` header the kernel writes at the start of each provided /// buffer. This is a stable kernel ABI (4 x u32). 
const RECVMSG_OUT_HDR: usize = 16; @@ -96,8 +92,7 @@ impl UringMultiRecv { let ring = IoUring::builder() .setup_single_issuer() .setup_coop_taskrun() - .setup_defer_taskrun() - .setup_cqsize(CQ_SIZE) + .setup_cqsize(FRAMEBUF_COUNT as u32) .build(sq_size)?; // Register socket file descriptors so the kernel can skip per-op fd lookup. SQEs then use @@ -176,6 +171,10 @@ impl UringMultiRecv { let framebuf_base = self.framebuf_ring_ptr as *mut BufRingEntry; let mask = FRAMEBUF_COUNT - 1; + // Sockets whose multishot terminated but could not be resubmitted because the SQ was full. + // Retried at the top of each loop iteration after submit drains the SQ. + let mut pending_resubmit: Vec = Vec::new(); + // Template msghdr for RecvMsgMulti. The kernel uses msg_namelen and msg_controllen to // determine the layout within each provided buffer. Must remain at a stable address for // the lifetime of the multishot SQEs (i.e., until this function returns). @@ -202,10 +201,19 @@ impl UringMultiRecv { Err(e) => return Err(e), } + // Retry any multishot resubmissions that failed on a previous iteration because the + // SQ was full. The submit_with_args above drained the SQ, so there should be room now. + pending_resubmit.retain(|&idx| { + let entry = opcode::RecvMsgMulti::new(types::Fixed(idx as u32), &msghdr, BGID) + .build() + .user_data(idx as u64); + unsafe { self.ring.submission().push(&entry) }.is_err() + }); + // Drain CQEs into a stack buffer, then process. This avoids heap allocation while // releasing the borrow on the completion queue before we need to touch the submission // queue or buffer ring. 
- let mut cqe_buf = [(0u64, 0i32, 0u32); CQ_SIZE as usize]; + let mut cqe_buf = [(0u64, 0i32, 0u32); FRAMEBUF_COUNT as usize]; let mut cqe_count = 0; for cqe in self.ring.completion() { cqe_buf[cqe_count] = (cqe.user_data(), cqe.result(), cqe.flags()); @@ -216,9 +224,12 @@ impl UringMultiRecv { let idx = ud as usize; if result < 0 { - let err = std::io::Error::from_raw_os_error(-result); - if err.raw_os_error() != Some(libc::ECANCELED) { - return Err(err); + let err_code = -result; + // ECANCELED: normal shutdown (SQE cancelled). + // ENOBUFS: provided buffer ring exhausted; multishot terminated. The + // resubmission logic below will restart it once buffers are returned. + if err_code != libc::ECANCELED && err_code != libc::ENOBUFS { + return Err(std::io::Error::from_raw_os_error(err_code)); } } else if let Some(buf_id) = cqueue::buffer_select(flags) { let buf_offset = buf_id as usize * BUF_ENTRY_SIZE; @@ -244,7 +255,9 @@ impl UringMultiRecv { let entry = opcode::RecvMsgMulti::new(types::Fixed(idx as u32), &msghdr, BGID) .build() .user_data(ud); - unsafe { self.ring.submission().push(&entry) }.ok(); + if unsafe { self.ring.submission().push(&entry) }.is_err() { + pending_resubmit.push(idx); + } } } } diff --git a/docs/design/01-candumpr-ux.md b/docs/design/01-candumpr-ux.md deleted file mode 100644 index db1e937..0000000 --- a/docs/design/01-candumpr-ux.md +++ /dev/null @@ -1,386 +0,0 @@ -# candumpr UX - -## Status - -**DRAFT** - -## Scope - -This document defines the user-facing features, CLI interface, and configuration file format for -candumpr, a CAN bus logging tool. It does not cover internal implementation details. - -candumpr is an opinionated replacement for can-utils `candump`, focused on J1939 networks. It -prioritizes performance and multi-network support at the cost of broader CAN compatibility. - -A primary design goal is lossless capture: candumpr should never drop a CAN frame under normal -operating conditions, including during log file rotation. 
Every frame that the kernel delivers to -the socket should appear in the output. - -An additional convenience is to optionally send a J1939 address claim PGN request to ensure that the -CAN logs include address claims for every control function near the beginning of every log. - -## Features - -### Frame support - -* Only supports CAN with 29-bit extended (J1939) identifiers. -* CAN FD and CAN XL are not supported. -* Error frames are supported and logged alongside data frames. - -### Multi-interface logging - -* Supports logging from an arbitrary number of CAN interfaces simultaneously. -* Each interface can be independently configured with its own filters and settings. -* Interfaces can be specified on the CLI, in a TOML config file, or both. - -### Filtering - -Two filtering mechanisms are supported. Both can be used together. - -**candump-compatible mask filters** are specified per-interface using the same syntax as candump: - -* `id:mask` -- positive match (accept when `received_id & mask == id & mask`) -* `id~mask` -- inverse match (accept when `received_id & mask != id & mask`) -* `#error_mask` -- error frame class filter (see `linux/can/error.h`) - -All values are hexadecimal. Multiple filters are comma-separated after the interface name. Appending -`j` or `J` to the filter list switches that interface from OR to AND semantics (same as candump). - -**Convenience filters** provide a more ergonomic way to filter J1939 traffic. These are specified in -the TOML config file: - -* Filter by PGN (Parameter Group Number) -* Filter by source address -* Future work: filter by ISONAME + mask -* Toggle error frame capture on or off - -Convenience filters are compiled to socket-level `id:mask` filters internally. - -When no filters are specified, all traffic is accepted. 
- -#### Filter combination semantics - -When multiple filters are specified on the same interface (whether candump-style masks, convenience -filters, or both), they are combined with OR by default: a frame is accepted if it matches any -filter. - -To switch to AND semantics (a frame must match all filters): - -* On the CLI, append `j` to the candump-style filter list (e.g., `can0,...,j`) -* In the TOML config, set `filter_join = "and"` on the interface or in `[defaults]` - -Both map to the `CAN_RAW_JOIN_FILTERS` socket option. - -### Output formats - -candumpr supports multiple output formats, configurable per-interface: - -* **candump** (`.log`) -- default -- the can-utils `candump -L` log file format: - `(1345212884.318850) can0 18FECA00#0011223344556677` -* **candump-tty** (`.log`) -- the can-utils `candump` console format: - `can0 18FECA00 [8] 00 11 22 33 44 55 66 77` -* **ASC** (`.asc`) -- Vector ASCII logging format, compatible with CANalyzer/CANoe and other tools - that import ASC files. -* **PCAP** (`.pcap`) -- packet capture format, compatible with Wireshark and tcpdump. - -When compressed, an additional `.zst` suffix is appended (e.g., `.log.zst`, `.asc.zst`). - -### Timestamps - -Timestamp mode controls how frame timestamps are displayed in candump and candump-tty output -formats. ASC and PCAP use their native timestamp conventions and ignore this setting. - -* **absolute** -- seconds since epoch with fractional seconds -* **delta** -- time elapsed since the previous received frame -* **zero** -- time elapsed since the first received frame - -Hardware timestamps from the CAN controller are used automatically when available, falling back to -kernel software timestamps with a diagnostic warning. This requires no configuration. - -### Clock correctness - -candumpr is designed to start early in the boot cycle on IoT devices that may lack a persistent RTC. 
-On these devices, `CLOCK_REALTIME` can be invalid (near epoch) until NTP or another time source -synchronizes it. - -candumpr will provide options to control how it detects an invalid clock and what it does with -frames captured before the clock becomes valid. Detection methods include a heuristic (is the clock -before a reasonable threshold?) and waiting for a clock step event. Behaviors may include dropping -frames, queueing them in memory, using zero-based timestamps, inserting a marker, or rotating the -log file when the clock becomes valid. The available behaviors may depend on the output format. - -One strategy for clock correctness is to give each log file a strictly monotonic incrementing index. -Then at least you can tell the order of the files. candumpr should also attempt to detect and log -clock jumps to stderr so that they're less surprising if you have to reverse engineer what the clock -did by looking at strictly just the logs. - -This feature requires dedicated detailed design and is not fully specified here. - -### File logging and rotation - -When logging to files, each monitored interface writes to its own log file. This applies even when -using the `any` interface binding; frames are separated by their source interface, and `{interface}` -resolves to the actual interface name (e.g., `can0`), not `any`. - -* Log filenames are controlled by a format string with placeholders: - * `{interface}` -- the source interface name (e.g., `can0`) - * `{start-unix}` -- Unix seconds when the log file was opened (e.g., `1741868400`) - * `{start-iso}` -- ISO 8601 timestamp when the log file was opened, without colons (e.g., - `2026-03-13T120000Z`), since colons break rsync and some filesystems. - * Default format: `candumpr-{interface}-{start-unix}` (plus the appropriate file extension). -* The log directory path supports the same `{interface}` placeholder, allowing per-interface - directory organization (e.g., `/var/log/candumpr/{interface}/`). 
-* If the resolved file path (directory + name + extension) would be identical for two or more - interfaces, candumpr exits with a configuration error. Disambiguation can be achieved by including - `{interface}` in the filename or directory path, or by setting different `log_dir` values - per-interface. -* File rotation can be triggered by: - * A time interval (e.g., `1h`, `30m`) - * A file size threshold (e.g., `50MB`, `1GB`) - * The value is unambiguous: size units (`B`, `KB`, `MB`, `GB`) and time units (`s`, `m`, `h`, `d`) - do not overlap. Bare integers without a unit suffix are rejected. - * SIGHUP is always available for manual rotation regardless of the configured method. -* During rotation, no frames are lost. Buffered frames are flushed to the old file before the new - file begins. -* Completed log files are never partially written. Files are written to a temporary name and renamed - atomically on completion. -* ZSTD streaming compression is optionally applied during writing. -* Buffered output is flushed to disk periodically (configurable interval) to limit data loss on - unexpected power loss or crash. - -When not logging to files, output goes to stdout. - -### Log retention - -When logging to files, candumpr can automatically prune old log files to prevent unbounded disk -usage. - -* **max_total_size** -- maximum total size of all completed log files across all interfaces (e.g., - `10GB`). When exceeded, the oldest completed log files are deleted regardless of which interface - produced them. Retention is checked after each log rotation. - -### J1939 address claim - -On startup and after each log rotation, candumpr can optionally broadcast a J1939 Address Claim PGN -request. This causes all devices on the bus to re-announce their addresses, ensuring each log file -contains a complete picture of which source addresses are in use. 
- -### Statistics - -Per-interface statistics counters are maintained and can be reported: - -* Frame count (total and per-second) -* Byte count and estimated bitrate -* Dropped frame count (frames lost due to socket buffer overflow) - -Dropped frame monitoring is always enabled. - -### Socket configuration - -* The socket receive buffer size can be configured per-interface. The tool attempts `SO_RCVBUFFORCE` - first (requires `CAP_NET_ADMIN`) and falls back to `SO_RCVBUF`. - -### Device resilience - -* If a monitored CAN interface goes down, candumpr continues running and resumes logging when the - interface comes back up. This is the default and only behavior (unlike candump, which exits by - default). - -### Signal handling - -* **SIGHUP** -- trigger log file rotation -* **SIGTERM / SIGINT** -- graceful shutdown (flush buffers, finalize current log file) - -### Diagnostic logging - -Operational events are logged to stderr via `tracing`: - -* Dropped frames (socket buffer overflow) -* Bus-off state changes and recovery -* Network interface up/down events -* Startup and shutdown status -* Log file rotation events - -This keeps CAN data output (stdout or log files) clean, while ensuring operational issues are -visible. The log level can be set via `--log-level` on the CLI, `log_level` in the TOML config, or -the `CANDUMPR_LOG` environment variable (in `EnvFilter` format). The environment variable takes -precedence when set. - -### Display options (stdout only) - -When outputting to a TTY: - -* Color mode (`--color`): - * `never` -- no color or styling - * `network` -- each interface gets a distinct color applied to the entire line, to visually - distinguish traffic from different networks - * `highlight` -- use color and weight to improve readability: the interface name and timestamp are - colored, and data bytes alternate between bold and normal weight to make it easier to visually - parse byte boundaries -* TX/RX direction is always shown for each frame. 
- -## CLI interface - -``` -candumpr [OPTIONS] [INTERFACE[,FILTER]...] -``` - -### Positional arguments - -Interfaces are specified as positional arguments, optionally followed by comma-separated -candump-compatible filters. The special name `any` receives from all CAN interfaces (same as -candump): - -```sh -# Listen on all CAN interfaces that are up -candumpr any - -# No filters (accept all traffic on both interfaces) -candumpr can0 can1 - -# candump-compatible mask filters -candumpr can0,18FECA00:1FFFFFFF can1,18FEE500:1FFFFFFF - -# Inverse match -candumpr can0,18FECA00~1FFFFFFF - -# Error frame filter -candumpr can0,#FFFFFFFF - -# Join filters with AND semantics (must match all) -candumpr can0,18FECA00:1FFF0000,00000017:000000FF,j -``` - -### Options - -#### Configuration - -| Flag | Description | -| --------------------- | -------------------------------------------- | -| `-C, --config ` | Path to a TOML configuration file | -| `--log-level ` | Diagnostic log level (e.g., `info`, `debug`) | - -CLI flags apply globally to every interface. Per-interface configuration, filtering, file logging -options (directory, naming, rotation, compression, retention), and socket tuning require a TOML -config file. Interfaces specified on the CLI are merged with interfaces in the config file. 
- -#### Output format - -| Flag | Description | -| ------------------------ | ------------------------------------------------------------------ | -| `-f, --format ` | Output format: `candump`, `candump-tty`, `asc`, `pcap` | -| `-t, --timestamp ` | Timestamp mode: `absolute`, `delta`, `zero` (candump formats only) | -| `-c, --color ` | Color mode: `never`, `network`, `highlight` | - -#### File logging - -| Flag | Description | -| ---- | ------------------------------------------------------- | -| `-l` | Log to files in the current directory (default: stdout) | - -#### J1939 - -| Flag | Description | -| ----------------------- | -------------------------------------------------------- | -| `-A`, `--address-claim` | Send address claim request on startup and after rotation | - -#### Termination - -| Flag | Description | -| -------------------- | -------------------------------------------------------- | -| `-n, --count ` | Exit after receiving n frames | -| `-T, --timeout ` | Exit if no frames received within this many milliseconds | - -## TOML configuration file - -The `[defaults]` section provides default values for all interface settings. Individual -`[interfaces.]` sections can override any default. All fields are optional at every level. 
- -```toml -log_level = "info" # diagnostic log level for stderr output -# All logs together must stay below this limit -max_total_size = "10GB" - -[defaults] -# Output -format = "candump" # "candump" | "candump-tty" | "asc" | "pcap" -timestamp = "absolute" # "absolute" | "delta" | "zero" (candump formats only) -color = "highlight" # "never" | "network" | "highlight" - -# File logging -log_dir = "/var/log/candumpr" # supports {interface} placeholder -log_name = "candumpr-{interface}-{start-unix}" # placeholders: {interface}, {start-unix}, {start-iso} -rotate = "1h" # time or size based rotation -compress = "none" # "zstd" | "none" -zstd_level = 3 -flush_interval = "5s" - -# Filtering -error_frames = true -pgns = [] -source_addresses = [] -filter_join = "or" # "and" | "or" - -# Socket -recv_buffer = "2MB" - -# J1939 -address_claim = true - -# --- Per-interface overrides --- - -# Inherits all [defaults], overrides nothing: -[interfaces.can0] -# Overrides specific settings: -[interfaces.can1] -error_frames = false -pgns = [0xFECA, 0xFEE5] - -[interfaces.can2] -address_claim = false -source_addresses = [0x00, 0x17] -log_dir = "/var/log/candumpr/can2" - -# candump-compatible raw filters: -[interfaces.can3] -filters = ["18FECA00:1FFFFFFF", "18FEE500~1FFFF00"] - -# AND semantics for all filters on this interface: -[interfaces.can4] -pgns = [0xFECA] -source_addresses = [0x17] -filter_join = "and" -``` - -### Precedence - -Settings are resolved in this order, highest priority first: - -1. CLI flags -2. TOML `[interfaces.]` -3. TOML `[defaults]` -4. Built-in defaults - -For settings available on the CLI, CLI flags apply globally and override all other sources, -including per-interface TOML settings. For example, `--format pcap` forces that format on every -interface. Most settings are only available through the TOML config file. - -List-valued options (`pgns`, `source_addresses`, `filters`) are replaced wholesale at each -precedence level, not merged. 
For example, if `[defaults]` sets `pgns = [0xFECA, 0xFEE5]` and -`[interfaces.can0]` sets `pgns = [0xFECA]`, then `can0` uses only `[0xFECA]`. - -### Interface discovery - -Interfaces to monitor are the union of: - -* Interfaces named on the CLI -* Interfaces listed in `[interfaces]` in the config file - -The special name `any` is specified on the CLI only (`candumpr any`). It binds to all CAN -interfaces, including interfaces that come up after candumpr has started. Using `any` and named -interfaces together is a configuration error, since the `any` binding would duplicate frames from -explicitly-bound interfaces. When using `any`, settings come from `[defaults]` (and CLI flags). - -Even when using `any`, log files are written per source interface (not a single combined file). - -At least one interface must be specified. diff --git a/docs/design/01-testing-strategy.md b/docs/design/01-testing-strategy.md new file mode 100644 index 0000000..61f3462 --- /dev/null +++ b/docs/design/01-testing-strategy.md @@ -0,0 +1,70 @@ +# Testing strategy + +## Status + +**IMPLEMENTED** + +## Scope + +This document describes the mechanisms available for writing tests around the utilities in this +project that depend on Linux socketcan interfaces that require either real hardware or elevated +permissions to create. + +## Problem + +These utilities interact directly with CAN sockets. Testing requires CAN interfaces, but: + +* Real CAN hardware is not available in CI. +* Virtual CAN (vcan) interfaces require `CAP_NET_ADMIN` to create. +* vcan interfaces are system-global resources, so parallel tests using shared interfaces cause + interference. +* Tests must run in CI (GitHub Actions) and locally without requiring root. + +## Solution: user + network namespaces + +Each test process enters its own namespace using `unshare(CLONE_NEWUSER | CLONE_NEWNET)`. 
Inside the +namespace, the process has `CAP_NET_ADMIN` without real root privileges, vcan interfaces are private +and isolated, and everything is cleaned up when the process exits. See the +[vcan-fixture](/vcan-fixture/src/lib.rs) crate for the implementation. + +Depending on your system (Fedora 42 doesn't need the following, but Ubuntu 24.04 does), you may need +to disable the following apparmor setting: + +```sh +sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 +``` + +Constraint: `unshare(CLONE_NEWUSER)` requires a single-threaded process. The Rust test harness is +multi-threaded, so namespace entry needs to happen in a `ctor` constructor before `main()`. + +```rust +#[ctor::ctor] +fn setup() { + tracing_subscriber::fmt() + .with_test_writer() + .init(); + vcan_fixture::enter_namespace(); +} +``` + +## CI + +Tests that require vcan use `#[cfg_attr(feature = "ci", ignore = "requires vcan")]`. In CI, +`--all-features` enables the `ci` feature, making them `#[ignore]`. They are then run as a separate +step gated on whether vcan setup succeeded. + +A separate canary job (`vcan-available`) with `continue-on-error: true` fails with a warning status +when the vcan module is unavailable on the runner, rather than silently skipping the tests. This +makes it visible in the PR workflow status when vcan isn't available, but doesn't prevent merging +for infrastructure reasons outside of my control (I've read about `linux-modules-extra` not always +matching the runner kernel version). + +See [lint.yml](/.github/workflows/lint.yml) for the implementation. 
+ +## Benchmarking + +There are additional utilities in the `vcan_fixture::bench` module for + +* Querying current thread and process resource usage +* Pinning the current process to N CPU cores +* Starting a PWM-like busyloop thread to approximate P% CPU usage over N threads diff --git a/docs/design/02-candumpr-filters.md b/docs/design/02-candumpr-filters.md deleted file mode 100644 index 264cafe..0000000 --- a/docs/design/02-candumpr-filters.md +++ /dev/null @@ -1,16 +0,0 @@ -# candumpr filter syntax and semantics - -## Status - -**TODO** - -## Scope - -This document specifies the filter syntax and semantics for candumpr, covering: - -* candump-compatible `id:mask` and `id~mask` filter syntax -* Error frame class filters (`#error_mask`) -* Convenience filters (PGN, source address) -* How convenience filters compile to kernel-level `CAN_RAW_FILTER` entries -* Filter combination semantics (OR vs AND, `CAN_RAW_JOIN_FILTERS`) -* Interaction between candump-style and convenience filters on the same interface diff --git a/docs/design/03-candumpr-clock-correctness.md b/docs/design/03-candumpr-clock-correctness.md deleted file mode 100644 index 1144fb4..0000000 --- a/docs/design/03-candumpr-clock-correctness.md +++ /dev/null @@ -1,16 +0,0 @@ -# candumpr clock correctness - -## Status - -**TODO** - -## Scope - -This document specifies how candumpr handles unreliable system clocks, covering: - -* Detection of an invalid `CLOCK_REALTIME` (heuristic threshold, clock step events) -* Behavior for frames captured before the clock becomes valid -* Monotonic file indexing to preserve ordering independent of wall clock -* Clock jump detection and diagnostic logging -* Interaction with log file rotation and timestamps -* Interaction with output formats (candump, ASC, PCAP) that embed timestamps diff --git a/docs/design/04-candumpr-architecture.md b/docs/design/04-candumpr-architecture.md deleted file mode 100644 index da4209c..0000000 --- a/docs/design/04-candumpr-architecture.md 
+++ /dev/null @@ -1,363 +0,0 @@ -# candumpr architecture - -## Status - -**TODO** - -## Scope - -This document specifies the internal architecture of candumpr, covering the threading model, I/O -strategy, and the mechanisms used to achieve lossless capture. It does not cover user-facing -features or CLI/config design (see [01-candumpr-ux](01-candumpr-ux.md)). - -## Target environment - -A modern-ish Linux with io_uring and socketcan available. A ~4 core ~1 GHz arm64 CPU with 1 GB -memory and 4+ J1939 CAN networks. - -## Design goal: never drop a frame - -TODO: Define what "never drop" means precisely. Kernel socket buffer overflow is the primary -mechanism for frame loss. Describe the end-to-end path from kernel socket buffer to flushed bytes on -disk, and identify every point where frames could be lost or delayed. - -## Design goal: lowest system impact - -TODO: I'm targeting using candumpr to log traffic from 4 500kbaud CAN networks on a 4 core system -responsible for other application concerns. The logging is not the purpose of the system, it's a -troubleshooting enabler. I'm after a solution with the minimal system performance impact. - -## Design goal: long-running CAN logging daemon to facilitate troubleshooting - -I want a long-running daemon to log all CAN traffic to facilitate future field issues. That means: - -* Address claim PGN requests upon startup and rotation -* Log rotation policy -* Log retention policy -* Configuration -* Usability with other tools (e.g., .pcap files with Wireshark) - -## Option 1: dedicated thread pairs - -One recv thread and one write thread per interface. The recv thread reads frames from the socket and -passes them to its paired write thread over a channel. The write thread handles formatting, -compression, and file I/O. - -TODO: Describe how io_uring fits in (recv side, write side, or both). Describe the channel type and -backpressure strategy. Describe how log rotation and SIGHUP are coordinated between the two threads. 
- -## Option 2: shared threads - -A small number of shared recv threads and shared write threads, rather than a dedicated pair per -interface. This may be a better fit for the target environment of 4 ARM cores with 4+ interfaces, -where dedicating 2 threads per interface would oversubscribe the CPU. - -TODO: Describe the multiplexing strategy (io_uring multishot recv, epoll, etc.). Describe how write -work is distributed. Describe how this interacts with per-interface file handles, rotation, and -compression state. - -## Back-of-the-napkin math - -### Frame rate - -A CAN 2.0B extended frame (29-bit ID) with an 8-byte payload uses the following bits on the wire, -assuming zero bitstuffing: - -| Field | Bits | -| -------------- | ------: | -| SOF | 1 | -| Base ID | 11 | -| SRR | 1 | -| IDE | 1 | -| Extended ID | 18 | -| RTR | 1 | -| r1 (reserved) | 1 | -| r0 (reserved) | 1 | -| DLC | 4 | -| Data (8 bytes) | 64 | -| CRC | 15 | -| CRC delimiter | 1 | -| ACK slot | 1 | -| ACK delimiter | 1 | -| EOF | 7 | -| IFS | 3 | -| **Total** | **131** | - -Reference: linux-can/can-utils -[canframelen.c](https://github.com/linux-can/can-utils/blob/master/canframelen.c) computes -`(eff ? 67 : 47) + len * 8` for the no-bitstuffing case (CFL_NO_BITSTUFFING). The worst-case -bitstuffing formula from -[canframelen.h](https://github.com/linux-can/can-utils/blob/master/canframelen.h) is `80 + 10 * len` -bits for extended frames. IFS (3 bits) is included in both formulas. - -Zero bitstuffing is the worst case for frame rate: fewer bits per frame means more frames per -second. J1939 always uses 29-bit extended IDs and 8-byte payloads. - -At 500 kbaud with zero bitstuffing: 500,000 / 131 = **3816 frames/sec per bus**. 
- -| Scenario | Frames/sec | -| ------------------------------- | ---------: | -| 1 bus, no bitstuffing | 3,816 | -| 1 bus, worst-case bitstuffing | 3,125 | -| 4 buses, no bitstuffing | 15,264 | -| 4 buses, worst-case bitstuffing | 12,500 | - -We'll proceed assuming 3,816 frames/sec, acknowledging that this would be 100% busload, which -doesn't happen in practice. - -### Per-frame recv cost - -The recv path for a single frame (ignoring formatting and write): - -1. The kernel delivers the frame to the socket buffer via interrupt. This cost is the same across - all backends. -2. The receiver wakes (or discovers a new CQE) and reads the frame. -3. The on_frame callback runs. - -The distinguishing cost is step 2: how many syscalls the receiver makes per frame, and how much -userspace work each backend does. - -| Approach | Syscalls/frame | Notes | -| ------------------------- | -------------: | -------------------------------------------------- | -| Dedicated (blocking read) | 1 | One `read()` per frame per thread | -| epoll + read | 1-2 | `epoll_wait` + `read`; drain loop adds EAGAIN read | -| epoll + recvmmsg | <1 | Batched reads reduce per-frame count | -| io_uring single-shot | ~1 | `submit_and_wait` both submits and collects | -| io_uring multishot | <1 | Multiple CQEs per `submit_and_wait`; no resubmit | - -For the multiplexed backends, frames arriving on multiple sockets within the same -`epoll_wait`/`submit_and_wait` window are serviced in a single wakeup. At 3816 fps per bus across 4 -buses, the mean inter-arrival across all sockets is ~65us, so overlapping arrivals are common. - -On a 1 GHz ARM64 processor, a syscall round trip (userspace to kernel and back) takes roughly 2-5us -depending on kernel mitigations and the specific operation. Using 3us as a rough estimate: - -| Approach | Est. 
recv CPU (4 buses, 3816 fps each) | Threads | -| ------------------ | -------------------------------------: | ------: | -| Dedicated | ~6% of one core | 4 | -| epoll + read | ~4-6% of one core | 1 | -| io_uring multishot | ~2% of one core | 1 | - -These cover only recv syscall overhead, excluding on_frame processing, scheduling, and cache -effects. - -### Context switches - -Each sleep/wake cycle is a voluntary context switch. The scheduling cost itself is small (~1-5 us), -but each wakeup pollutes L1/L2 caches, affecting co-resident applications. - -| Approach | Threads | Est. context switches/sec (4 buses) | -| ------------------------------------ | ------: | ----------------------------------: | -| Dedicated | 4 | Up to 15,264 | -| Multiplexed (one frame per wakeup) | 1 | Up to 15,264 | -| Multiplexed (multiple frames/wakeup) | 1 | Fewer; depends on arrival overlap | - -The dedicated backend also incurs involuntary context switches when its recv threads compete with -application threads for cores. - -### Plausible bottlenecks - -1. **Context switches and cache pollution.** At worst-case rates, the receiver wakes up to ~15,000 - times/sec. Each wakeup pollutes L1/L2 caches, affecting co-resident applications. This is likely - the dominant source of system impact, since the raw CPU cost for recv is small (2-6% of one - core). - -2. **Socket buffer overflow.** The CAN_RAW receive buffer capacity depends on `rmem_max` and sk_buff - overhead; on a typical system it may hold only a few hundred frames. At 3816 fps, even a buffer - of 200 frames fills in ~52 ms if the receiver stalls. Any write-path backpressure lasting longer - causes frame loss. - -3. **Thread oversubscription (dedicated only).** With 4 recv threads on a 4-core system, the recv - threads alone use all available cores before accounting for write threads or other application - work. Involuntary preemption increases and cache efficiency drops. - -4. 
**Write path stalls (out of scope for recv benchmarks).** The recv path must drain the socket - buffer faster than frames arrive, even when the write path stalls for disk I/O or log rotation. - The socket buffer depth sets the maximum tolerable stall. - - candumpr will use a dedicated write thread and another larger and growable frame queue to - mitigate this. This is an important choice, because we can't arbitrarily grow the recvbuf, - there's a maximum limit. - -## Benchmarking strategy - -### Goals - -The recv benchmarks answer three questions: - -1. **Multiplexed vs. dedicated.** Does a single-threaded multiplexed receiver have lower system - impact than one-thread-per-socket on a 4-core system with 4 CAN interfaces? - -2. **Which multiplexed backend.** Among epoll + read, epoll + recvmmsg, io_uring single-shot, and - io_uring multishot with provided buffers: which has the lowest per-frame overhead and the fewest - context switches? - -3. **Where to optimize.** What is the per-frame instruction cost, and where are the hot spots? - -### Test environment - -All benchmarks run on vcan interfaces inside an isolated user + network namespace (via -`unshare(2)`). This eliminates the need for root or hardware CAN interfaces. - -vcan delivers frames as fast as possible synchronously through the kernel's loopback path, with no -bus timing or contention. This is appropriate for measuring recv path overhead in isolation. Results -should be validated on the target hardware before making final decisions. - -### Benchmark A: per-frame instruction cost - -**Purpose.** Measure userspace instruction overhead per frame for each backend. Identify which code -paths dominate the per-frame cost and where optimization effort should focus. - -**Method.** Callgrind-based profiling via gungraun. - -**Setup.** - -1. Create 4 vcan interfaces. -2. Open one TX and one RX socket per interface. -3. Pre-send frames into the RX socket buffers. 
The number per interface is limited by the kernel - socket receive buffer (constrained by `rmem_max`, which cannot be raised inside a user - namespace). The benchmark should determine the usable capacity at runtime and fill to that limit. -4. Start the profiled region. -5. Run the backend to drain all frames. -6. End the profiled region. - -Pre-filling rather than concurrent sending ensures that the profiled region contains only recv work -and that the send cost is identical across backends. - -**Callback fairness.** The on_frame callback must have identical cost across all backends. The -dedicated backend runs multiple threads, so its counting mechanism needs to be thread-safe. Each -dedicated recv thread should count frames in a thread-local variable (not a shared atomic) to avoid -penalizing it with synchronization overhead that belongs to the test harness, not the backend. The -single-threaded backends should use the same local-variable approach. - -**Metrics.** - -* Instructions per frame -* L1 data cache miss rate -* Branch misprediction rate - -**Limitations.** Callgrind counts userspace instructions only. For io_uring backends, kernel-side -CQE processing and buffer ring management are not captured. Treat io_uring instruction counts as a -lower bound that excludes kernel work. - -### Benchmark B: steady-state system impact - -**Purpose.** Measure the receiver's CPU time and scheduling overhead under sustained load at -realistic CAN frame rates. - -**Method.** Concurrent senders and receiver. Collect per-thread resource usage for the receiver -only. - -**Setup.** - -1. Create vcan interfaces in an isolated namespace. -2. Spawn one sender thread per interface. Senders pace frames at the target rate using sleep-based - timing (`clock_nanosleep` with `TIMER_ABSTIME`). Do not use busy-spin pacing; it burns CPU and - contaminates resource measurements. -3. Run the backend under test on the receiver thread. -4. 
Collect resource usage via `getrusage(RUSAGE_THREAD)` on the receiver thread before and after the - run. For the dedicated backend, collect `RUSAGE_THREAD` from each sub-thread and aggregate. -5. A timer thread stops all threads after the run duration. - -**Metric isolation.** Using `RUSAGE_THREAD` rather than `RUSAGE_SELF` excludes sender threads, the -timer thread, and all other process-level overhead from the measurements. At realistic frame rates, -the receiver's CPU contribution is small and would be invisible in a process-wide measurement. - -**Test matrix.** - -| Parameter | Values | -| -------------------- | ---------------------------------------------- | -| Backends | dedicated, epoll, recvmmsg, uring, uring_multi | -| Interfaces | 1, 2, 4 | -| Rate (per interface) | 1000 fps, 2000 fps, 4000 fps | -| Duration | 8 seconds | -| Repetitions | 4, report median by receiver kernel time | -| Core constraint | Use `taskset -c 0-3` to limit to 4 cores | - -5 backends x 3 interface counts x 3 rates = 45 configurations. At 4 repetitions and 8 seconds per -run, a full sweep takes roughly 24 minutes. - -**Metrics (per run).** - -| Metric | Source | Purpose | -| ------------------------ | ------------------------- | ------------------------------ | -| Receiver user CPU (ms) | RUSAGE_THREAD `ru_utime` | Userspace processing cost | -| Receiver kernel CPU (ms) | RUSAGE_THREAD `ru_stime` | Kernel time for recv syscalls | -| Receiver voluntary csw | RUSAGE_THREAD `ru_nvcsw` | Sleep/wake frequency | -| Receiver involuntary csw | RUSAGE_THREAD `ru_nivcsw` | Preemption frequency | -| Frames sent | Sender counter | Confirms rate pacing accuracy | -| Frames received | Receiver counter | Confirms lossless capture | -| Frame loss % | (sent - recv) / sent | Must be 0% at all tested rates | - -**Rate pacing accuracy.** At 4000 fps per interface, the inter-frame interval is 250us. -`clock_nanosleep` with absolute timestamps should achieve this within a few microseconds of jitter. 
-Verify that the actual sent count matches the expected count (rate x duration) within 1%. - -### Benchmark C: recv under CPU contention - -**Purpose.** Determine which backend is most resilient to frame loss when the system is under CPU -pressure from other workloads. candumpr is an ancillary concern on the target system; the primary -application may consume most of the available CPU, and the recv backend must survive this without -dropping frames. - -**Method.** Run benchmark B's send/recv setup alongside a synthetic CPU load on the same cores. -Measure frame loss at different contention levels. - -**Setup.** - -1. Create 4 vcan interfaces in an isolated namespace. -2. Start a CPU load generator on the same cores as the benchmark. Use - `stress-ng --cpu 4 --cpu-load P --taskset 0-3` where P is the target load percentage. Each worker - duty-cycles between burning and sleeping to approximate P% utilization per core. -3. Run the send/recv harness from benchmark B (sleep-paced senders, receiver, timer) on the same - cores. -4. Collect the same per-thread metrics as benchmark B, plus frame loss. - -**Test matrix.** - -| Parameter | Values | -| --------------- | ---------------------------------------------- | -| Backends | dedicated, epoll, recvmmsg, uring, uring_multi | -| Interfaces | 4 | -| Rate | 4000 fps per interface | -| CPU contention | 75%, 95% | -| Duration | 8 seconds | -| Repetitions | 4, report median by frame loss % | -| Core constraint | `taskset -c 0-3` | - -5 backends x 2 contention levels = 10 configurations. At 4 repetitions and 8 seconds per run, a full -sweep takes roughly 6 minutes. 
- -**Metrics (per run).** - -| Metric | Source | Purpose | -| ------------------------ | ------------------------- | ------------------------------------ | -| Frame loss % | (sent - recv) / sent | Primary: resilience under contention | -| Receiver user CPU (ms) | RUSAGE_THREAD `ru_utime` | How much CPU the receiver got | -| Receiver kernel CPU (ms) | RUSAGE_THREAD `ru_stime` | Kernel time under contention | -| Receiver voluntary csw | RUSAGE_THREAD `ru_nvcsw` | Wakeup frequency under pressure | -| Receiver involuntary csw | RUSAGE_THREAD `ru_nivcsw` | How often the receiver was preempted | - -**What to look for.** At 75% contention, all backends should remain lossless (the receiver needs -only 2-6% of one core). At 95%, some backends may start dropping frames. The interesting result is -the relative degradation: a backend that degrades gradually (small loss %) is preferable to one that -collapses suddenly (large loss %). - -### Caveats - -* **vcan is not a real CAN bus.** There is no bus arbitration, no propagation delay, no error - frames, no bitstuffing, and no hardware interrupt path. These benchmarks measure the software recv - overhead only. -* **Callgrind and io_uring.** Instruction counts for io_uring backends undercount the true per-frame - cost because kernel-side ring processing is not instrumented. -* **x86_64 vs. ARM64.** Benchmarks run on a development workstation. Syscall costs, cache sizes, and - branch predictor behavior differ on the target ARM64 platform. Use these results for relative - comparison between backends, not as absolute predictions. - -It's technically possible to measure syscalls per thread with `perf_event_open` to setup a counter -for the `raw_syscalls:sys_enter` tracepoint using the `perf-event` crate, but this doesn't work well -inside the unshare user namespace without additional orchestration externally. 
- -## Open questions - -TODO diff --git a/docs/design/05-testing-strategy.md b/docs/design/05-testing-strategy.md deleted file mode 100644 index 6d2f09b..0000000 --- a/docs/design/05-testing-strategy.md +++ /dev/null @@ -1,87 +0,0 @@ -# Testing strategy - -## Status - -**DRAFT** - -## Scope - -This document specifies how candumpr (and other tools in this workspace) are tested, given that they -depend on Linux socketcan interfaces that require either real hardware or elevated permissions to -create. - -## Problem - -candumpr interacts directly with CAN sockets. Testing requires CAN interfaces, but: - -* Real CAN hardware is not available in CI. -* Virtual CAN (vcan) interfaces require `CAP_NET_ADMIN` to create. -* vcan interfaces are system-global resources, so parallel tests using shared interfaces cause - interference. -* Tests must run in CI (GitHub Actions) and locally without requiring root. - -## Solution: user + network namespaces - -Each test process enters its own isolated Linux network namespace using -`unshare(CLONE_NEWUSER | CLONE_NEWNET)`. Inside the namespace, the process has `CAP_NET_ADMIN` -without real root privileges, vcan interfaces are private and isolated, and everything is cleaned up -when the process exits. See the [vcan-fixture](../../vcan-fixture/) crate for the implementation. - -Constraint: `unshare(CLONE_NEWUSER)` requires a single-threaded process. The Rust test harness is -multi-threaded, so namespace entry happens in a `ctor` constructor before `main()`. - -## Test tiers - -### Unit tests - -No sockets, no namespaces. Config parsing, filter compilation, output formatting, filename template -expansion, duration/size parsing. - -### Integration tests - -Run inside user + network namespaces with vcan interfaces. Socket binding, filter application, -multi-interface capture, file rotation, ZSTD streaming, address claim, device resilience. - -### End-to-end tests - -Run the actual binary inside a network namespace. 
Launch candumpr, send frames with cangenr, verify -output files, signal handling, config file loading. - -## CI - -Tests that require vcan use `#[cfg_attr(feature = "ci", ignore = "requires vcan")]`. In CI, -`--all-features` enables the `ci` feature, making them `#[ignore]`. They are then run as a separate -step gated on whether vcan setup succeeded: - -A separate canary job (`vcan-available`) with `continue-on-error: true` shows yellow when the vcan -module is unavailable on the runner, rather than silently skipping the tests. - -See [lint.yml](/.github/workflows/lint.yml) for the implementation. - -## Benchmarking - -Benchmarks compare candumpr against candump on 4 vcan interfaces with J1939 traffic. - -### Metrics - -* **Frame loss** (primary): frames sent vs. frames in output -* **Throughput ceiling**: send rate at which frames start dropping -* **CPU usage**: total CPU time (user + system) -* **Memory usage**: peak RSS - -### Simulating the target environment - -The target is a ~4 core ~1 GHz ARM CPU. Use `taskset` to pin benchmarks to 4 cores: - -```sh -taskset -c 0-3 cargo bench -``` - -Core count is the important variable for comparing architecture options (dedicated thread pairs vs. -shared threads). Clock speed matters less for relative comparison. Final validation must happen on -real target hardware. - -### Acceptance criteria - -candumpr must not drop frames at the realistic J1939 rate (2000 frames/s per interface, 8000 -frames/s aggregate). At higher rates, candumpr should drop fewer frames than candump. diff --git a/docs/design/06-benchmarks.md b/docs/design/06-benchmarks.md deleted file mode 100644 index d5b85f8..0000000 --- a/docs/design/06-benchmarks.md +++ /dev/null @@ -1,200 +0,0 @@ -# Benchmark results - -The [04-candumpr-architecture.md](/docs/design/04-candumpr-architecture.md) design document proposes -three different benchmarks to compare receiver backends. 
- -# Benchmark A - pure CPU cost - -``` -recv_cost::recv_cost::dedicated run:setup_blocking() - Instructions: 454678|N/A (*********) - L1 Hits: 866202|N/A (*********) - LL Hits: 10504|N/A (*********) - RAM Hits: 175|N/A (*********) - Total read+write: 876881|N/A (*********) - Estimated Cycles: 924847|N/A (*********) -recv_cost::recv_cost::epoll run:setup_nonblocking() - Instructions: 519312|N/A (*********) - L1 Hits: 960184|N/A (*********) - LL Hits: 10182|N/A (*********) - RAM Hits: 53|N/A (*********) - Total read+write: 970419|N/A (*********) - Estimated Cycles: 1012949|N/A (*********) - Comparison with dedicated run:setup_blocking() - Instructions: 454678|519312 (-12.4461%) [-1.14215x] - L1 Hits: 866202|960184 (-9.78792%) [-1.10850x] - LL Hits: 10504|10182 (+3.16244%) [+1.03162x] - RAM Hits: 175|53 (+230.189%) [+3.30189x] - Total read+write: 876881|970419 (-9.63893%) [-1.10667x] - Estimated Cycles: 924847|1012949 (-8.69758%) [-1.09526x] -recv_cost::recv_cost::recvmmsg run:setup_nonblocking() - Instructions: 468571|N/A (*********) - L1 Hits: 882905|N/A (*********) - LL Hits: 10191|N/A (*********) - RAM Hits: 57|N/A (*********) - Total read+write: 893153|N/A (*********) - Estimated Cycles: 935855|N/A (*********) - Comparison with dedicated run:setup_blocking() - Instructions: 454678|468571 (-2.96497%) [-1.03056x] - L1 Hits: 866202|882905 (-1.89182%) [-1.01928x] - LL Hits: 10504|10191 (+3.07134%) [+1.03071x] - RAM Hits: 175|57 (+207.018%) [+3.07018x] - Total read+write: 876881|893153 (-1.82186%) [-1.01856x] - Estimated Cycles: 924847|935855 (-1.17625%) [-1.01190x] - Comparison with epoll run:setup_nonblocking() - Instructions: 519312|468571 (+10.8289%) [+1.10829x] - L1 Hits: 960184|882905 (+8.75281%) [+1.08753x] - LL Hits: 10182|10191 (-0.08831%) [-1.00088x] - RAM Hits: 53|57 (-7.01754%) [-1.07547x] - Total read+write: 970419|893153 (+8.65093%) [+1.08651x] - Estimated Cycles: 1012949|935855 (+8.23781%) [+1.08238x] -recv_cost::recv_cost::uring 
run:setup_nonblocking() - Instructions: 587770|N/A (*********) - L1 Hits: 1071803|N/A (*********) - LL Hits: 10210|N/A (*********) - RAM Hits: 119|N/A (*********) - Total read+write: 1082132|N/A (*********) - Estimated Cycles: 1127018|N/A (*********) - Comparison with dedicated run:setup_blocking() - Instructions: 454678|587770 (-22.6436%) [-1.29272x] - L1 Hits: 866202|1071803 (-19.1827%) [-1.23736x] - LL Hits: 10504|10210 (+2.87953%) [+1.02880x] - RAM Hits: 175|119 (+47.0588%) [+1.47059x] - Total read+write: 876881|1082132 (-18.9673%) [-1.23407x] - Estimated Cycles: 924847|1127018 (-17.9386%) [-1.21860x] - Comparison with epoll run:setup_nonblocking() - Instructions: 519312|587770 (-11.6471%) [-1.13182x] - L1 Hits: 960184|1071803 (-10.4141%) [-1.11625x] - LL Hits: 10182|10210 (-0.27424%) [-1.00275x] - RAM Hits: 53|119 (-55.4622%) [-2.24528x] - Total read+write: 970419|1082132 (-10.3234%) [-1.11512x] - Estimated Cycles: 1012949|1127018 (-10.1213%) [-1.11261x] - Comparison with recvmmsg run:setup_nonblocking() - Instructions: 468571|587770 (-20.2799%) [-1.25439x] - L1 Hits: 882905|1071803 (-17.6243%) [-1.21395x] - LL Hits: 10191|10210 (-0.18609%) [-1.00186x] - RAM Hits: 57|119 (-52.1008%) [-2.08772x] - Total read+write: 893153|1082132 (-17.4636%) [-1.21159x] - Estimated Cycles: 935855|1127018 (-16.9618%) [-1.20427x] -recv_cost::recv_cost::uring_multi run:setup_nonblocking() - Instructions: 628114|N/A (*********) - L1 Hits: 1145140|N/A (*********) - LL Hits: 11463|N/A (*********) - RAM Hits: 168|N/A (*********) - Total read+write: 1156771|N/A (*********) - Estimated Cycles: 1208335|N/A (*********) - Comparison with dedicated run:setup_blocking() - Instructions: 454678|628114 (-27.6122%) [-1.38145x] - L1 Hits: 866202|1145140 (-24.3584%) [-1.32202x] - LL Hits: 10504|11463 (-8.36605%) [-1.09130x] - RAM Hits: 175|168 (+4.16667%) [+1.04167x] - Total read+write: 876881|1156771 (-24.1958%) [-1.31919x] - Estimated Cycles: 924847|1208335 (-23.4610%) [-1.30652x] - Comparison 
with epoll run:setup_nonblocking() - Instructions: 519312|628114 (-17.3220%) [-1.20951x] - L1 Hits: 960184|1145140 (-16.1514%) [-1.19263x] - LL Hits: 10182|11463 (-11.1751%) [-1.12581x] - RAM Hits: 53|168 (-68.4524%) [-3.16981x] - Total read+write: 970419|1156771 (-16.1097%) [-1.19203x] - Estimated Cycles: 1012949|1208335 (-16.1699%) [-1.19289x] - Comparison with recvmmsg run:setup_nonblocking() - Instructions: 468571|628114 (-25.4003%) [-1.34049x] - L1 Hits: 882905|1145140 (-22.8998%) [-1.29701x] - LL Hits: 10191|11463 (-11.0966%) [-1.12482x] - RAM Hits: 57|168 (-66.0714%) [-2.94737x] - Total read+write: 893153|1156771 (-22.7891%) [-1.29515x] - Estimated Cycles: 935855|1208335 (-22.5500%) [-1.29116x] - Comparison with uring run:setup_nonblocking() - Instructions: 587770|628114 (-6.42304%) [-1.06864x] - L1 Hits: 1071803|1145140 (-6.40420%) [-1.06842x] - LL Hits: 10210|11463 (-10.9308%) [-1.12272x] - RAM Hits: 119|168 (-29.1667%) [-1.41176x] - Total read+write: 1082132|1156771 (-6.45236%) [-1.06897x] - Estimated Cycles: 1127018|1208335 (-6.72967%) [-1.07215x] -``` - -# Benchmark B - system impact - -| backend | ifaces | rate | sent | recv | lost | user_ms | sys_ms | vol_csw | invol_csw | -| ----------- | ------ | ---- | ----- | ----- | ---- | ------- | ------ | ------- | --------- | -| dedicated | 1 | 1000 | 5000 | 5000 | 0 | 6.1 | 0.0 | 5000 | 0 | -| dedicated | 1 | 2000 | 10000 | 10000 | 0 | 11.7 | 0.0 | 10000 | 0 | -| dedicated | 1 | 4000 | 20000 | 20000 | 0 | 22.7 | 0.0 | 19996 | 0 | -| dedicated | 2 | 1000 | 10000 | 10000 | 0 | 12.2 | 0.0 | 10000 | 0 | -| dedicated | 2 | 2000 | 20000 | 20000 | 0 | 18.1 | 4.8 | 19999 | 0 | -| dedicated | 2 | 4000 | 40000 | 40000 | 0 | 44.4 | 0.0 | 39997 | 0 | -| dedicated | 4 | 1000 | 20000 | 20000 | 0 | 22.0 | 0.0 | 19999 | 0 | -| dedicated | 4 | 2000 | 40000 | 40000 | 0 | 34.6 | 7.8 | 39993 | 12 | -| dedicated | 4 | 4000 | 80000 | 80000 | 0 | 66.9 | 22.5 | 79956 | 48 | -| epoll | 1 | 1000 | 5000 | 5000 | 0 | 3.9 | 3.9 | 4999 | 
0 | -| epoll | 1 | 2000 | 10000 | 10000 | 0 | 7.4 | 7.4 | 10000 | 0 | -| epoll | 1 | 4000 | 20000 | 20000 | 0 | 14.2 | 14.3 | 19999 | 0 | -| epoll | 2 | 1000 | 10000 | 9999 | 1 | 7.8 | 7.8 | 9865 | 0 | -| epoll | 2 | 2000 | 20000 | 19999 | 1 | 14.6 | 14.6 | 19871 | 1 | -| epoll | 2 | 4000 | 40000 | 39999 | 1 | 41.7 | 14.3 | 38664 | 1 | -| epoll | 4 | 1000 | 20000 | 19997 | 3 | 26.8 | 0.0 | 16407 | 1 | -| epoll | 4 | 2000 | 40000 | 39997 | 3 | 0.0 | 46.6 | 26749 | 62 | -| epoll | 4 | 4000 | 80000 | 79997 | 3 | 0.0 | 103.7 | 66257 | 18 | -| recvmmsg | 1 | 1000 | 5000 | 5000 | 0 | 0.0 | 7.9 | 5000 | 0 | -| recvmmsg | 1 | 2000 | 10000 | 10000 | 0 | 0.0 | 15.1 | 10000 | 0 | -| recvmmsg | 1 | 4000 | 20000 | 20000 | 0 | 0.0 | 28.7 | 19999 | 0 | -| recvmmsg | 2 | 1000 | 10000 | 9999 | 1 | 0.0 | 15.4 | 9896 | 0 | -| recvmmsg | 2 | 2000 | 20000 | 19999 | 1 | 0.0 | 29.6 | 19894 | 0 | -| recvmmsg | 2 | 4000 | 40000 | 39999 | 1 | 0.0 | 57.9 | 39893 | 0 | -| recvmmsg | 4 | 1000 | 20000 | 19997 | 3 | 0.0 | 26.2 | 15838 | 5 | -| recvmmsg | 4 | 2000 | 40000 | 39997 | 3 | 0.0 | 52.7 | 32025 | 7 | -| recvmmsg | 4 | 4000 | 80000 | 79997 | 3 | 0.0 | 101.0 | 63199 | 69 | -| uring | 1 | 1000 | 5000 | 5000 | 0 | 0.0 | 7.3 | 5048 | 0 | -| uring | 1 | 2000 | 10000 | 10000 | 0 | 0.0 | 14.0 | 10047 | 0 | -| uring | 1 | 4000 | 20000 | 20000 | 0 | 0.0 | 26.7 | 20046 | 1 | -| uring | 2 | 1000 | 10000 | 9999 | 1 | 0.0 | 14.2 | 9897 | 0 | -| uring | 2 | 2000 | 20000 | 19999 | 1 | 0.0 | 27.2 | 19924 | 0 | -| uring | 2 | 4000 | 40000 | 39999 | 1 | 7.6 | 44.8 | 39836 | 2 | -| uring | 4 | 1000 | 20000 | 19997 | 3 | 3.8 | 20.3 | 14763 | 10 | -| uring | 4 | 2000 | 40000 | 39997 | 3 | 8.1 | 42.2 | 33084 | 7 | -| uring | 4 | 4000 | 80000 | 79997 | 3 | 15.3 | 78.8 | 61615 | 43 | -| uring_multi | 1 | 1000 | 5000 | 5000 | 0 | 1.0 | 6.1 | 5000 | 0 | -| uring_multi | 1 | 2000 | 10000 | 10000 | 0 | 1.7 | 11.3 | 10000 | 0 | -| uring_multi | 1 | 4000 | 20000 | 20000 | 0 | 3.9 | 21.1 | 19997 | 0 | -| uring_multi | 
2 | 1000 | 10000 | 10000 | 0 | 1.3 | 7.0 | 5000 | 0 | -| uring_multi | 2 | 2000 | 20000 | 20000 | 0 | 2.5 | 13.4 | 9999 | 0 | -| uring_multi | 2 | 4000 | 40000 | 40000 | 0 | 4.8 | 25.6 | 19996 | 0 | -| uring_multi | 4 | 1000 | 20000 | 20000 | 0 | 1.8 | 9.4 | 5000 | 2 | -| uring_multi | 4 | 2000 | 40000 | 40000 | 0 | 13.6 | 9.5 | 9995 | 22 | -| uring_multi | 4 | 4000 | 80000 | 80000 | 0 | 35.1 | 8.6 | 19984 | 11 | - -# Benchmark C - system contention - -## 4 core ~75% utilization - -| backend | ifaces | rate | sent | recv | lost | user_ms | sys_ms | vol_csw | invol_csw | -| ----------- | ------ | ---- | ----- | ----- | ---- | ------- | ------ | ------- | --------- | -| dedicated | 4 | 4000 | 79991 | 79989 | 2 | 7.8 | 43.5 | 61858 | 169 | -| epoll | 4 | 4000 | 79997 | 79994 | 3 | 5.7 | 40.7 | 33651 | 327 | -| recvmmsg | 4 | 4000 | 79995 | 79994 | 1 | 6.3 | 40.3 | 34500 | 389 | -| uring | 4 | 4000 | 80000 | 79997 | 3 | 3.4 | 46.7 | 39036 | 284 | -| uring_multi | 4 | 4000 | 79993 | 79992 | 1 | 4.4 | 20.7 | 11021 | 110 | - -## 4 core ~90% utilization - -| backend | ifaces | rate | sent | recv | lost | user_ms | sys_ms | vol_csw | invol_csw | -| ----------- | ------ | ---- | ----- | ----- | ---- | ------- | ------ | ------- | --------- | -| dedicated | 4 | 4000 | 79991 | 79991 | 0 | 8.1 | 27.0 | 56873 | 81 | -| epoll | 4 | 4000 | 79993 | 79991 | 2 | 9.9 | 31.7 | 40314 | 150 | -| recvmmsg | 4 | 4000 | 79993 | 79991 | 2 | 7.1 | 33.7 | 39261 | 184 | -| uring | 4 | 4000 | 80000 | 79991 | 9 | 7.9 | 29.2 | 37673 | 115 | -| uring_multi | 4 | 4000 | 79993 | 79992 | 1 | 3.6 | 16.2 | 9232 | 64 | - -**NOTE:** Fewer involuntary context switches under higher CPU utilization is counter intuitive, but -correct. It means the receiver is being starved rather than interrupted. Compare the sys_ms kernel -CPU time. 
- -# Takeaways - -* The pure CPU cost of the receive backends don't matter hugely, because the dominant cost is the - syscalls and context switching -* The multiplex methods are all pretty close to each other in terms of results -* It appears all backends degrade nicely when the system is under high CPU load -* It doesn't look like I'm going to get absolutely no dropped frames -* Batching receives in the multishot backend dramatically reduces kernel CPU time and context - switches, moreso than the other multiplex backends, and even at high CPU load diff --git a/docs/design/candumpr/01-goals.md b/docs/design/candumpr/01-goals.md new file mode 100644 index 0000000..063a249 --- /dev/null +++ b/docs/design/candumpr/01-goals.md @@ -0,0 +1,69 @@ +# candumpr goals + +## Scope + +This document outlines the design goals and feature list of the candumpr utility. + +## Goals + +* A long-running logging daemon useful for troubleshooting events after-the-fact +* Controls for disk usage +* Controls for flash disk wear +* Minimal system performance impact from logging multiple interfaces +* Target low-spec Linux 6.1+ 4-core ~1GHz ARM CPUs with ~1GB memory +* Logs are not corrupted on power loss +* Frame drops due to the socket rcvbuf overflowing are minimized +* Is still useful for troubleshooting early on in the boot process, before the system clock is set +* Controls for system clock jumps + +## Features + +From these goals, we derive the following features + +* Multiple CAN networks logged from one process (performance, utility) +* Logs are rotated, compressed, and follow a retention policy (utility) +* Filename includes the start time of the log (utility) + * Needs further consideration together with system-clock jumps, especially early on in the boot + process. 
+* Address claim PGN requests can be optionally sent upon rotation (utility) +* Bus state changes are logged to stderr (utility) +* Utilize io_uring with multishot to batch receive across multiple interfaces (performance, + low-spec system) +* Streaming compression when writing to disk (performance, disk wear, disk usage) +* Partial frames are not written (corruption) +* Streaming compression does not require an epilogue at the tail to decompress the file (corruption) +* Writes are buffered (performance) +* Dedicated receive and write threads (minimize drops on a low-spec system) +* Multiple output formats are supported: can-utils candump, Vector ASC, PCAP (utility) +  * PCAP, as a binary format, is expected to have a lower disk usage, wear, and compression +    footprint than the can-utils ASCII format. This needs to be verified. + +Note: Many of the performance justifications for features are based on practical experience with +proprietary solutions I cannot share. So it looks like naive "but, performance!" handwaving, but it +_is_ based on experience. Additionally, some of the features exist to work around other constraints +(having a fixed small rmem_max, or low system specs). + +Note: CAN SKBs have a higher overhead than I originally imagined. It differs based on kernel version +and features, but the `recv_cost` benchmark uses `SK_MEMINFO_RMEM_ALLOC` and a probe frame to +calculate the size of each SKB as 960 bytes on my x64 Fedora 42 system. That's enough room for 220 +frames on my system. + +On low-spec systems I have worked on, that is not enough room to prevent frame drops when `write()` +calls sporadically block for multiple seconds. This is the primary motivation for offloading the +formatting, compression, and writing off onto a secondary thread. It's very likely that one thread +could handle the performance cost of everything, but blocking writes can, and do cause frame drops +on the real-world systems I'm writing this tool to support. 
+ +## Needs further design + +The goals around handling invalid system clocks need further thought. It's useful to save the start +timestamp in the filename when it's created. But if the system clock isn't known at that time, or if +it's 30,000 years in the future, what do we do? + +Additionally, how do we handle clock jumps in the middle of a log? + +A potentially useful feature is to include a monotonic file index in each filename so even if the +timestamp isn't known, we can tell what order messages were received in. + +Additionally, the candumpr process should log to stderr upon error, rotation, bus events, clock +jumps, etc. diff --git a/docs/design/candumpr/02-architecture.md b/docs/design/candumpr/02-architecture.md new file mode 100644 index 0000000..c83aba0 --- /dev/null +++ b/docs/design/candumpr/02-architecture.md @@ -0,0 +1,121 @@ +# candumpr architecture + +Status: **PROPOSAL** + +# Scope + +This document proposes the core data pipeline for candumpr. + +# Goals + +The baseline implementation that this proposal intends to improve upon is using one can-utils +`candump` process to do blocking receives for each logged network. On low-spec systems, this results +in a noticeable performance impact, which would be manageable, except that the logging on those +systems is ancillary to the application software those systems are primarily responsible for. + +Paraphrasing the goals from [01-goals.md](/docs/design/candumpr/01-goals.md), the overall goal for +candumpr is to reduce the system performance impact of using candump in this manner. + +# Proposed architecture + +The proposal is to use one shared receive thread that uses io_uring multishot to batch receives +across multiple networks. This reduces the number of syscalls per frame to less than one per frame. +This reduces the overall context switching cost when logging multiple networks. 
+ +As I intend to support systems with slow disks (`write()` syscalls that sporadically block for +multiple seconds), the receive thread is decoupled from the format + write thread, which also +services multiple networks. See [01-goals.md](/docs/design/candumpr/01-goals.md) for additional +background. + +Assume a worst-case throughput of 8x 500kbaud networks at 100% busload. That's 500KB/s of raw data, +plus some inflationary factor from the formatter (formatting as ASCII adds a constant scalar to the +throughput). This is well within the formatting, compression, and write capabilities of a single +thread. + +```mermaid +graph TD +    can0 & can1 & can2 & can3 --> io_uring + +    subgraph recv [recv thread] +        io_uring +    end + +    io_uring --> |spsc| formatter + +    subgraph write [write thread] +        subgraph formatter +            direction TD + +            can-utils-file +            can-utils-console +            vector-asc +            pcap +        end + +        formatter --> |"&[u8]"| Writer + +        subgraph writer +            Writer --> zstd & BufWriter & stdout +            zstd --> RotationHandler +            BufWriter --> RotationHandler +            RotationHandler --> file +        end +    end +``` + +## Receiver detail + +There are many ways in which a receiver thread or threads could be built using Linux syscalls: + +* candump-style blocking `read()` in a dedicated thread per interface +* `epoll()` and non-blocking `read()` to wake up and receive frames one-by-one when they arrive +* `epoll()` and non-blocking `recvmmsg()` to receive as many ready frames as possible on any wakeup +* `io_uring` singleshot - each SQE represents one `read()` - after reading from a socket, the `Recv` +  opcode is resubmitted. +* `io_uring` multishot - one SQE submitted for each socket with the `RecvMsgMulti` opcode and +  `submit_with_args(batch=4, timeout=100ms)` to wait until `batch` frames are ready to receive +  together from any interfaces. + +The multishot io_uring receiver strategy results in _significantly_ fewer syscalls and context +switches per ready CAN frame, resulting in overall better system performance and more graceful +degradation under contention. 
+ +The batch size could be significantly increased when logging to a file, but when logging to +`stdout`, we should use a lower batch size (like 4) to facilitate watching a live log. We cannot +infinitely increase the batch size - there's a tipping point at which if we increase it too far, we +run the risk of filling up the rcvbuf. A batch size of 32 or 64 seems like a reasonable upper +limit. + +## Formatter detail + +Use a Strategy design pattern to format the CAN frame into a bytearray to be written. The output is +a bytearray that may include multiple frames, and an indication of which interface the formatted +frames came from (so they can be written to the appropriate file). The bytearray only ever +includes full frames; a frame will never be split across multiple bytearrays. + +## Compression detail + +There are three approaches I've been able to find: + +1. Independent concatenated frames - periodically call `.finish()` on the zstd `Encoder`, and +   probably call `fsync()` as well + +   Output is decompressable with `zstd -d`. I think with large-ish frames (1MB?) the compression +   ratio might be good enough that the simplicity of this approach wins over the complexity of managing +   training dictionaries in a production context. + +2. Independent concatenated frames with a pretrained dictionary - train a dictionary on CAN data + +   Output is decompressable with `zstd -d -D can.dict`. It might be best to train a dictionary +   specific to each format? We would need to maintain and ship pre-trained dictionaries, and make +   them available to engineers troubleshooting CAN traffic. + +   I think it could be easy to add a configuration option to candumpr to provide your own zstd +   dictionary, in which case candumpr's own implementation doesn't bear the burden of the dictionary +   training, that's offloaded onto the consumer. + +3. Prefix-linked frames - persist the compressor state from previous frames when compressing the +   next. + +   Best compression ratio. 
Output is **not** decompressable with `zstd -d`, would need to implement + a custom decompressor, which I think eliminates this option from consideration. diff --git a/docs/design/candumpr/04-benchmarks.md b/docs/design/candumpr/04-benchmarks.md new file mode 100644 index 0000000..d5b171c --- /dev/null +++ b/docs/design/candumpr/04-benchmarks.md @@ -0,0 +1,233 @@ +# candumpr benchmarks + +candumpr isn't doing a lot of heavy-duty expensive _computation_. However, we've experienced +performance hits from running multiple instances of `candump` together on a low-spec system, so if +we want to build something with lower impact, we should measure and compare. + +The [02-architecture.md](/docs/design/candumpr/02-architecture.md) design document outlines several +basic strategies for receiving CAN frames from multiple networks at once. This document benchmarks +each using three different benchmarks. + +1. `recv_cost` - measure the userspace CPU cost of each implementation. + + This benchmark is the least valuable, as a receiver executing X% more or less instructions is + less impactful to the overall system performance than the amount of context switches and + user/kernel CPU time. +2. `recv_impact` - pin the benchmark to 4 cores and measure the following metrics for each + implementation: + 1. dropped frames + 2. user and kernel CPU time + 3. voluntary and involuntary context switches +3. `recv_contention` - execute the same benchmark as `recv_impact`, but spin 4 threads doing + PWM-style spinloops to hit 75% and 90% CPU usage in each thread. 
+ +These benchmarks can be run with + +```sh +cargo install gungraun-runner +cargo bench +``` + +# Results + +## recv_cost + +``` +recv_cost::recv_cost::dedicated run:setup_blocking() +  Instructions:              454664|N/A             (*********) +  L1 Hits:                   866130|N/A             (*********) +  LL Hits:                    10552|N/A             (*********) +  RAM Hits:                     169|N/A             (*********) +  Total read+write:          876851|N/A             (*********) +  Estimated Cycles:          924805|N/A             (*********) +recv_cost::recv_cost::epoll run:setup_nonblocking() +  Instructions:              519312|N/A             (*********) +  L1 Hits:                   960112|N/A             (*********) +  LL Hits:                    10255|N/A             (*********) +  RAM Hits:                      52|N/A             (*********) +  Total read+write:          970419|N/A             (*********) +  Estimated Cycles:         1013207|N/A             (*********) +  Comparison with dedicated run:setup_blocking() +  Instructions:              454664|519312          (-12.4488%) [-1.14219x] +  L1 Hits:                   866130|960112          (-9.78865%) [-1.10851x] +  LL Hits:                    10552|10255           (+2.89615%) [+1.02896x] +  RAM Hits:                     169|52              (+225.000%) [+3.25000x] +  Total read+write:          876851|970419          (-9.64202%) [-1.10671x] +  Estimated Cycles:          924805|1013207         (-8.72497%) [-1.09559x] +recv_cost::recv_cost::recvmmsg run:setup_nonblocking() +  Instructions:              468571|N/A             (*********) +  L1 Hits:                   882834|N/A             (*********) +  LL Hits:                    10262|N/A             (*********) +  RAM Hits:                      57|N/A             (*********) +  Total read+write:          893153|N/A             (*********) +  Estimated Cycles:          936139|N/A             (*********) +  Comparison with dedicated run:setup_blocking() +  Instructions:              454664|468571          (-2.96796%) [-1.03059x] +  L1 Hits:                   866130|882834          (-1.89209%) [-1.01929x] +  LL Hits:                    10552|10262           (+2.82596%) [+1.02826x] +  RAM Hits:                     169|57              (+196.491%) [+2.96491x] +  Total read+write:          876851|893153          (-1.82522%) [-1.01859x] +  Estimated Cycles:          924805|936139          (-1.21072%) [-1.01226x] +  Comparison with epoll run:setup_nonblocking() +  Instructions:              519312|468571          (+10.8289%) [+1.10829x] +  L1 Hits:                   960112|882834          (+8.75340%) [+1.08753x] +  LL Hits:                    10255|10262           (-0.06821%) [-1.00068x] +  RAM Hits:                      52|57              (-8.77193%) [-1.09615x] +  Total read+write:          970419|893153          (+8.65093%) [+1.08651x] +  Estimated Cycles: 
1013207|936139 (+8.23254%) [+1.08233x] +recv_cost::recv_cost::uring run:setup_nonblocking() + Instructions: 587770|N/A (*********) + L1 Hits: 1071728|N/A (*********) + LL Hits: 10285|N/A (*********) + RAM Hits: 119|N/A (*********) + Total read+write: 1082132|N/A (*********) + Estimated Cycles: 1127318|N/A (*********) + Comparison with dedicated run:setup_blocking() + Instructions: 454664|587770 (-22.6459%) [-1.29276x] + L1 Hits: 866130|1071728 (-19.1838%) [-1.23738x] + LL Hits: 10552|10285 (+2.59601%) [+1.02596x] + RAM Hits: 169|119 (+42.0168%) [+1.42017x] + Total read+write: 876851|1082132 (-18.9701%) [-1.23411x] + Estimated Cycles: 924805|1127318 (-17.9641%) [-1.21898x] + Comparison with epoll run:setup_nonblocking() + Instructions: 519312|587770 (-11.6471%) [-1.13182x] + L1 Hits: 960112|1071728 (-10.4146%) [-1.11625x] + LL Hits: 10255|10285 (-0.29169%) [-1.00293x] + RAM Hits: 52|119 (-56.3025%) [-2.28846x] + Total read+write: 970419|1082132 (-10.3234%) [-1.11512x] + Estimated Cycles: 1013207|1127318 (-10.1223%) [-1.11262x] + Comparison with recvmmsg run:setup_nonblocking() + Instructions: 468571|587770 (-20.2799%) [-1.25439x] + L1 Hits: 882834|1071728 (-17.6252%) [-1.21396x] + LL Hits: 10262|10285 (-0.22363%) [-1.00224x] + RAM Hits: 57|119 (-52.1008%) [-2.08772x] + Total read+write: 893153|1082132 (-17.4636%) [-1.21159x] + Estimated Cycles: 936139|1127318 (-16.9587%) [-1.20422x] +recv_cost::recv_cost::uring_multi run:setup_nonblocking() + Instructions: 686528|N/A (*********) + L1 Hits: 1265738|N/A (*********) + LL Hits: 11611|N/A (*********) + RAM Hits: 217|N/A (*********) + Total read+write: 1277566|N/A (*********) + Estimated Cycles: 1331388|N/A (*********) + Comparison with dedicated run:setup_blocking() + Instructions: 454664|686528 (-33.7734%) [-1.50997x] + L1 Hits: 866130|1265738 (-31.5711%) [-1.46137x] + LL Hits: 10552|11611 (-9.12066%) [-1.10036x] + RAM Hits: 169|217 (-22.1198%) [-1.28402x] + Total read+write: 876851|1277566 (-31.3655%) [-1.45699x] + 
Estimated Cycles: 924805|1331388 (-30.5383%) [-1.43964x] + Comparison with epoll run:setup_nonblocking() + Instructions: 519312|686528 (-24.3568%) [-1.32200x] + L1 Hits: 960112|1265738 (-24.1461%) [-1.31832x] + LL Hits: 10255|11611 (-11.6786%) [-1.13223x] + RAM Hits: 52|217 (-76.0369%) [-4.17308x] + Total read+write: 970419|1277566 (-24.0416%) [-1.31651x] + Estimated Cycles: 1013207|1331388 (-23.8984%) [-1.31403x] + Comparison with recvmmsg run:setup_nonblocking() + Instructions: 468571|686528 (-31.7477%) [-1.46515x] + L1 Hits: 882834|1265738 (-30.2514%) [-1.43372x] + LL Hits: 10262|11611 (-11.6183%) [-1.13146x] + RAM Hits: 57|217 (-73.7327%) [-3.80702x] + Total read+write: 893153|1277566 (-30.0895%) [-1.43040x] + Estimated Cycles: 936139|1331388 (-29.6870%) [-1.42221x] + Comparison with uring run:setup_nonblocking() + Instructions: 587770|686528 (-14.3851%) [-1.16802x] + L1 Hits: 1071728|1265738 (-15.3278%) [-1.18103x] + LL Hits: 10285|11611 (-11.4202%) [-1.12893x] + RAM Hits: 119|217 (-45.1613%) [-1.82353x] + Total read+write: 1082132|1277566 (-15.2974%) [-1.18060x] + Estimated Cycles: 1127318|1331388 (-15.3276%) [-1.18102x] +``` + +**NOTE:** the `uring_multi` benchmark is noticeably more expensive in terms of CPU cost than any of +the other receivers. + +**NOTE:** the callgrind counters only include userspace, not any kernelspace processing. 
+ +## 4-core recv_impact + +| backend | ifaces | rate | sent | recv | lost | user_ms | sys_ms | vol_csw | invol_csw | +| ----------- | ------ | ---- | ----- | ----- | ---- | ------- | ------ | ------- | --------- | +| dedicated | 1 | 1000 | 5000 | 5000 | 0 | 6.2 | 0.0 | 5002 | 0 | +| dedicated | 1 | 2000 | 10000 | 10000 | 0 | 12.0 | 0.0 | 10002 | 0 | +| dedicated | 1 | 4000 | 20000 | 20000 | 0 | 23.4 | 0.0 | 19992 | 1 | +| dedicated | 2 | 1000 | 10000 | 10000 | 0 | 12.7 | 0.0 | 10002 | 0 | +| dedicated | 2 | 2000 | 20000 | 20000 | 0 | 23.5 | 0.0 | 20002 | 0 | +| dedicated | 2 | 4000 | 40000 | 40000 | 0 | 45.4 | 0.0 | 39999 | 2 | +| dedicated | 4 | 1000 | 20000 | 20000 | 0 | 23.0 | 0.0 | 20007 | 8 | +| dedicated | 4 | 2000 | 40000 | 40000 | 0 | 27.6 | 17.6 | 39994 | 43 | +| dedicated | 4 | 4000 | 80000 | 80000 | 0 | 43.2 | 43.7 | 79976 | 74 | +| epoll | 1 | 1000 | 5000 | 5000 | 0 | 0.0 | 7.9 | 5002 | 0 | +| epoll | 1 | 2000 | 10000 | 10000 | 0 | 0.0 | 14.9 | 10002 | 0 | +| epoll | 1 | 4000 | 20000 | 20000 | 0 | 0.0 | 29.1 | 20000 | 1 | +| epoll | 2 | 1000 | 10000 | 10000 | 0 | 0.0 | 15.6 | 9877 | 0 | +| epoll | 2 | 2000 | 20000 | 20000 | 0 | 0.0 | 29.2 | 18685 | 2 | +| epoll | 2 | 4000 | 40000 | 40000 | 0 | 0.0 | 57.6 | 39354 | 1 | +| epoll | 4 | 1000 | 20000 | 20000 | 0 | 0.0 | 26.4 | 15649 | 0 | +| epoll | 4 | 2000 | 40000 | 40000 | 0 | 27.3 | 19.8 | 27296 | 79 | +| epoll | 4 | 4000 | 80000 | 80000 | 0 | 36.9 | 64.5 | 62625 | 38 | +| recvmmsg | 1 | 1000 | 5000 | 5000 | 0 | 1.5 | 6.5 | 5002 | 0 | +| recvmmsg | 1 | 2000 | 10000 | 10000 | 0 | 2.9 | 12.3 | 10002 | 0 | +| recvmmsg | 1 | 4000 | 20000 | 20000 | 0 | 4.9 | 24.3 | 19990 | 3 | +| recvmmsg | 2 | 1000 | 10000 | 10000 | 0 | 2.7 | 11.7 | 8714 | 0 | +| recvmmsg | 2 | 2000 | 20000 | 20000 | 0 | 5.4 | 23.6 | 19785 | 1 | +| recvmmsg | 2 | 4000 | 40000 | 40000 | 0 | 10.9 | 46.2 | 39655 | 1 | +| recvmmsg | 4 | 1000 | 20000 | 20000 | 0 | 5.0 | 21.9 | 16174 | 1 | +| recvmmsg | 4 | 2000 | 40000 | 40000 | 0 | 13.9 | 35.8 
| 30158 | 9 | +| recvmmsg | 4 | 4000 | 80000 | 80000 | 0 | 19.7 | 83.4 | 64193 | 84 | +| uring | 1 | 1000 | 5000 | 5000 | 0 | 1.5 | 6.1 | 5052 | 0 | +| uring | 1 | 2000 | 10000 | 10000 | 0 | 2.7 | 11.4 | 10051 | 0 | +| uring | 1 | 4000 | 20000 | 20000 | 0 | 5.5 | 20.9 | 20045 | 1 | +| uring | 2 | 1000 | 10000 | 10000 | 0 | 2.8 | 11.4 | 9870 | 0 | +| uring | 2 | 2000 | 20000 | 20000 | 0 | 4.8 | 19.8 | 17398 | 1 | +| uring | 2 | 4000 | 40000 | 40000 | 0 | 9.9 | 40.9 | 39676 | 3 | +| uring | 4 | 1000 | 20000 | 20000 | 0 | 4.8 | 19.7 | 15157 | 1 | +| uring | 4 | 2000 | 40000 | 40000 | 0 | 15.1 | 33.5 | 31504 | 11 | +| uring | 4 | 4000 | 80000 | 80000 | 0 | 10.5 | 85.0 | 63739 | 51 | +| uring_multi | 1 | 1000 | 5000 | 5000 | 0 | 0.0 | 6.1 | 5002 | 0 | +| uring_multi | 1 | 2000 | 10000 | 10000 | 0 | 0.0 | 11.9 | 10003 | 0 | +| uring_multi | 1 | 4000 | 20000 | 20000 | 0 | 1.9 | 20.7 | 19981 | 1 | +| uring_multi | 2 | 1000 | 10000 | 10000 | 0 | 2.1 | 9.3 | 9727 | 0 | +| uring_multi | 2 | 2000 | 20000 | 20000 | 0 | 4.1 | 18.2 | 19750 | 0 | +| uring_multi | 2 | 4000 | 40000 | 40000 | 0 | 7.9 | 34.5 | 39573 | 2 | +| uring_multi | 4 | 1000 | 20000 | 20000 | 0 | 3.9 | 17.0 | 15835 | 10 | +| uring_multi | 4 | 2000 | 40000 | 40000 | 0 | 6.8 | 33.1 | 28508 | 46 | +| uring_multi | 4 | 4000 | 80000 | 80000 | 0 | 13.8 | 67.9 | 64548 | 9 | + +## 4-core recv_contention + +### ~75% CPU + +| backend | ifaces | rate | sent | recv | lost | user_ms | sys_ms | vol_csw | invol_csw | +| ----------- | ------ | ---- | ----- | ----- | ---- | ------- | ------ | ------- | --------- | +| dedicated | 4 | 4000 | 80000 | 80000 | 0 | 10.0 | 40.9 | 58617 | 178 | +| epoll | 4 | 4000 | 79982 | 79982 | 0 | 9.2 | 38.8 | 34911 | 324 | +| recvmmsg | 4 | 4000 | 79999 | 79999 | 0 | 7.9 | 39.0 | 34242 | 329 | +| uring | 4 | 4000 | 79997 | 79997 | 0 | 6.9 | 36.0 | 29406 | 402 | +| uring_multi | 4 | 4000 | 79995 | 79995 | 0 | 5.3 | 34.9 | 36565 | 227 | + +### ~90% CPU + +| backend | ifaces | rate | sent | recv | 
lost | user_ms | sys_ms | vol_csw | invol_csw | +| ----------- | ------ | ---- | ----- | ----- | ---- | ------- | ------ | ------- | --------- | +| dedicated   | 4      | 4000 | 79998 | 79998 | 0    | 8.1     | 27.5   | 56204   | 94        | +| epoll       | 4      | 4000 | 79995 | 79995 | 0    | 7.7     | 35.3   | 38999   | 145       | +| recvmmsg    | 4      | 4000 | 80000 | 80000 | 0    | 3.3     | 37.8   | 38645   | 138       | +| uring       | 4      | 4000 | 79978 | 79978 | 0    | 6.4     | 31.9   | 35142   | 150       | +| uring_multi | 4      | 4000 | 80000 | 80000 | 0    | 3.4     | 27.9   | 35751   | 85        | + +**NOTE:** Fewer involuntary context switches under higher CPU utilization is counter intuitive, but +correct. It means the receiver is being starved rather than interrupted. Compare the sys_ms kernel +CPU time between 75% and 90% results. + +## Takeaways + +* The pure CPU cost of the receivers doesn't matter nearly as much as the number of syscalls and +  context switches. +* The multiplex methods (epoll, recvmmsg, and uring) are all pretty similar to each other. The +  uring_multi approach is significantly better than the rest due to batching receives. It's +  equivalent in cost if we set the batch size to 1. +* It appears all backends degrade nicely when the system is under high CPU load. +  * This is with a very cheap frame handler that exerts no backpressure +* It doesn't look like it's possible to guarantee no dropped frames diff --git a/docs/developer/quickstart.md b/docs/developer/quickstart.md new file mode 100644 index 0000000..b32ffae --- /dev/null +++ b/docs/developer/quickstart.md @@ -0,0 +1,98 @@ +# Developer quickstart + +## MSRV + +This is a Cargo virtual workspace project. All crates are versioned and released together. The MSRV +is Rust 1.89. + +The minimum supported target environment is Linux 6.1+ with a 4-core ~1GHz ARM CPU with ~1GB of +memory. Many of the design choices reflect the constraints of this environment. 
+ +## Build and lint + +Building and testing is the usual: + +```sh +cargo build +cargo clippy --all-targets +``` + +This project uses custom rustfmt options that make merge conflicts on module imports much +easier to resolve: + +```sh +cargo fmt -- --config group_imports=StdExternalCrate,imports_granularity=Module +``` + +There are examples you can run with `cargo run --example=dump`. You likely need to create at least +one vcan network on your development host: + +```sh +sudo ip link add dev can0 type vcan +sudo ip link set up can0 +``` + +## Tests + +Tests may be run either with `cargo test` or `cargo nextest`: + +```sh +cargo test +cargo nextest run +``` + +### Test fixtures + +There are test fixtures provided by the [vcan-fixture](/vcan-fixture/src/lib.rs) crate. This +provides several features: + +* `enter_namespace()` - enter a process namespace that allows creating vcan networks, which would +  otherwise require additional permissions outside the namespace. +* `VcanHarness::new(num)` - create a number of unique vcan interfaces - this is thread safe, and is +  intended for use in tests. +* `bench::getrusage_thread()` and `getrusage_self()` - get resource usages for the current thread or +  process. This measures user and system time, as well as context switches. Other resources could be +  added in the future. +* `bench::pin_to_cores(n)` - pin the current process to the first `n` CPU cores +* `bench::start_cpu_load(num, percent)` - starts `num` threads doing a PWM-like busyloop to hit +  `percent` CPU usage + +It's assumed that the local developer environment has the necessary vcan kernel module. In CI, we +attempt to install the vcan module, but can skip the vcan-dependent tests with a warning if it's not +available. + +### ASAN + +As this project uses quite a bit of `unsafe` Rust to interact with `libc`, it's important to run +with ASAN. 
You can do this with: + +```sh +# tests +RUSTFLAGS="$RUSTFLAGS -Zsanitizer=address" cargo +nightly nextest run -Zbuild-std --target x86_64-unknown-linux-gnu +# example +RUSTFLAGS="$RUSTFLAGS -Zsanitizer=address" cargo +nightly run -Zbuild-std --target x86_64-unknown-linux-gnu --example=dump +``` + +## Benchmarks + +This project includes several benchmarks. Some of them depend on +[gungraun](https://gungraun.github.io/gungraun/latest/html/index.html): + +```sh +cargo install gungraun-runner +cargo bench +``` + +## Release process + +This project isn't released to crates.io, but there is still a GitHub release workflow. +Here's the release checklist: + +* [ ] Use SemVer to pick an appropriate version number +* [ ] Edit the workspace [Cargo.toml](/Cargo.toml)'s `workspace.package.version` +* [ ] Ensure the [CHANGELOG.md](/CHANGELOG.md) has a heading for the new version +* [ ] Check the changelog entry. Did anything get forgotten? Is it formatted well? Spelling, +  phrasing, grammar, etc. +* [ ] Merge a PR including the Cargo.toml and CHANGELOG.md changes. 
+  * [ ] A Git tag will be generated +  * [ ] The contents of the CHANGELOG.md will be used to create a GitHub release diff --git a/docs/candumpr-configuration.md b/docs/user/candumpr-configuration.md similarity index 100% rename from docs/candumpr-configuration.md rename to docs/user/candumpr-configuration.md diff --git a/vcan-fixture/Cargo.toml b/vcan-fixture/Cargo.toml index f324eee..a645c2a 100644 --- a/vcan-fixture/Cargo.toml +++ b/vcan-fixture/Cargo.toml @@ -10,6 +10,7 @@ description = "Build vcan interfaces in isolated network namespaces" ci = [] [dependencies] +assert_cmd.workspace = true ctor.workspace = true eyre.workspace = true libc.workspace = true diff --git a/vcan-fixture/src/cmd.rs b/vcan-fixture/src/cmd.rs new file mode 100644 index 0000000..9889d31 --- /dev/null +++ b/vcan-fixture/src/cmd.rs @@ -0,0 +1,43 @@ +use std::process::Output; + +pub use assert_cmd::Command; + +pub trait CommandExt { +    /// Same as [Command::output] except with hooks to print stdout/stderr in failed tests +    fn captured_output(&mut self) -> std::io::Result<Output>; +} + +impl CommandExt for Command { +    fn captured_output(&mut self) -> std::io::Result<Output> { +        let output = self.output()?; + +        // libtest injects magic in print! macros to capture output in tests +        print!("{}", String::from_utf8_lossy(&output.stdout)); +        eprint!("{}", String::from_utf8_lossy(&output.stderr)); + +        Ok(output) +    } +} + +/// Get a command to run the given tool binary. +/// +/// Uses `CARGO_BIN_EXE_<name>` which cargo sets at compile time for integration tests in the same +/// crate as the binary. +/// +/// # Example +/// ```ignore +/// use vcan_fixture::cmd::{tool, CommandExt}; +/// +/// let output = tool!("candumpr") +///     .arg("--help") +///     .captured_output() +///     .unwrap(); +/// ``` +#[macro_export] +macro_rules! 
tool { + ($name:literal) => {{ + let mut cmd = $crate::Command::new(env!(concat!("CARGO_BIN_EXE_", $name))); + cmd.arg("--log-level=TRACE"); + cmd + }}; +} diff --git a/vcan-fixture/src/lib.rs b/vcan-fixture/src/lib.rs index 500ab91..5eed516 100644 --- a/vcan-fixture/src/lib.rs +++ b/vcan-fixture/src/lib.rs @@ -49,6 +49,7 @@ //! available by default. pub mod bench; +pub mod cmd; mod netlink; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};