diff --git a/.codecov.yml b/.codecov.yml index a393180b..4caafbf6 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -10,6 +10,7 @@ coverage: ignore: - "ext/**/*" + - "src/legacy/**/*" comment: layout: "reach,diff,flags,files" diff --git a/Project.toml b/Project.toml index 463d051d..c61ccfa7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AdaptiveArrayPools" uuid = "4f381ef7-9af0-4cbe-99d4-cf36d7b0f233" -version = "0.2.1" authors = ["Min-Gu Yoo "] +version = "0.2.1" [deps] Preferences = "21216c6a-2e73-6563-6e65-726566657250" @@ -14,7 +14,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" AdaptiveArrayPoolsCUDAExt = "CUDA" [compat] -julia = "1.10" -Preferences = "1" CUDA = "5" -Printf = "1" \ No newline at end of file +Preferences = "1" +Printf = "1" +julia = "1.10" diff --git a/docs/design/cuda_extension_design.md b/docs/design/cuda_extension_design.md index 5b31c2d8..f7eb7ee7 100644 --- a/docs/design/cuda_extension_design.md +++ b/docs/design/cuda_extension_design.md @@ -1,9 +1,10 @@ # AdaptiveArrayPools.jl CUDA Extension Design -> **Status**: Draft v0.6 (Post-Review Revision) -> **Version**: 0.6 -> **Date**: 2024-12-14 -> **Authors**: Design discussion with AI assistance +> **Update (v0.2.2, feat/new_array_nd)**: The CPU path now uses `setfield!`-based wrapper +> reuse (Julia 1.11+) instead of the N-way cache for `unsafe_acquire!`. The **CUDA extension +> still uses the N-way set-associative cache** described in this document, since `CuArray` +> does not support `setfield!`-based field mutation. `CACHE_WAYS` and `set_cache_ways!` are +> now only relevant for the CUDA backend (and Julia 1.10 legacy CPU path). ## 1. Executive Summary diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md index 3d8c84a4..b05d0f17 100644 --- a/docs/design/hybrid_api_design.md +++ b/docs/design/hybrid_api_design.md @@ -1,5 +1,13 @@ # Hybrid API Design: acquire! vs unsafe_acquire! 
+> **Update (v0.2.2, feat/new_array_nd)**: The `unsafe_acquire!` path no longer uses +> `unsafe_wrap` + N-way cache on Julia 1.11+ CPU. Instead, it uses `setfield!`-based +> wrapper reuse — **0-alloc for any number of dimension patterns** (no eviction limit). +> The N-way cache (`CACHE_WAYS`) is now only used by the **CUDA** backend and the +> **Julia 1.10 legacy** fallback. The `acquire!` → `ReshapedArray` path is unchanged. +> `TypedPool` fields changed: `nd_arrays`/`nd_dims`/`nd_ptrs`/`nd_next_way` → +> `nd_wrappers::Vector{Union{Nothing, Vector{Any}}}`. + ## Executive Summary Redesigning `AdaptiveArrayPools.jl`'s N-D array acquisition API with a **Two Tools Strategy**: diff --git a/docs/design/nd_array_approach_comparison.md b/docs/design/nd_array_approach_comparison.md index d4a531fd..04653bff 100644 --- a/docs/design/nd_array_approach_comparison.md +++ b/docs/design/nd_array_approach_comparison.md @@ -1,5 +1,12 @@ # N-D Array Approach Comparison: unsafe_wrap vs ReshapedArray +> **Update (v0.2.2, feat/new_array_nd)**: The N-way set-associative cache described in this +> document has been **superseded on Julia 1.11+ CPU** by `setfield!`-based wrapper reuse +> (`nd_wrappers` indexed by dimensionality N). This achieves **0-alloc for unlimited dimension +> patterns** — no eviction, no `CACHE_WAYS` limit. The N-way cache remains in use for +> **CUDA** and the **Julia 1.10 legacy** path. The `acquire!` → `ReshapedArray` path is +> unchanged. See `src/acquire.jl` and `src/types.jl` for the current implementation. 
+ ## Summary This document analyzes two approaches for returning N-dimensional arrays from AdaptiveArrayPools: diff --git a/docs/src/architecture/design-docs.md b/docs/src/architecture/design-docs.md index 4b5e0514..8d9b68a3 100644 --- a/docs/src/architecture/design-docs.md +++ b/docs/src/architecture/design-docs.md @@ -10,7 +10,7 @@ For in-depth analysis of design decisions, implementation tradeoffs, and archite ## Caching & Performance - **[nd_array_approach_comparison.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/nd_array_approach_comparison.md)** - N-way cache design, boxing analysis, and ReshapedArray benchmarks + N-way cache design (now legacy — replaced by `setfield!` reuse on Julia 1.11+ CPU), boxing analysis, and ReshapedArray benchmarks - **[fixed_slots_codegen_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/fixed_slots_codegen_design.md)** Zero-allocation iteration via `@generated` functions and fixed-slot type dispatch @@ -32,7 +32,7 @@ For in-depth analysis of design decisions, implementation tradeoffs, and archite | Document | Focus Area | Key Insights | |----------|------------|--------------| | hybrid_api_design | API strategy | View types for zero-alloc, Array for FFI | -| nd_array_approach_comparison | Caching | N-way associative cache reduces header allocation | +| nd_array_approach_comparison | Caching | N-way cache (legacy); setfield! 
reuse on Julia 1.11+ CPU | | fixed_slots_codegen_design | Codegen | @generated functions enable type-stable iteration | | untracked_acquire_design | Macro safety | Sentinel pattern ensures correct cleanup | | cuda_extension_design | GPU support | Seamless CPU/CUDA API parity | diff --git a/docs/src/architecture/how-it-works.md b/docs/src/architecture/how-it-works.md index 06ba9826..f812e5e0 100644 --- a/docs/src/architecture/how-it-works.md +++ b/docs/src/architecture/how-it-works.md @@ -82,57 +82,66 @@ end When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` — no dictionary lookup, no type instability. -## N-Way Set Associative Cache +## N-D Wrapper Reuse (CPU) -For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: +For `unsafe_acquire!` (which returns native `Array` types), the caching strategy depends on the Julia version: + +### Julia 1.11+: `setfield!`-based Wrapper Reuse (Zero-Allocation) + +Julia 1.11 changed `Array` from an opaque C struct to a mutable Julia struct with `ref::MemoryRef{T}` and `size::NTuple{N,Int}` fields. This enables in-place mutation of cached `Array` wrappers via `setfield!`: ``` - CACHE_WAYS = 4 (default) - ┌────┬────┬────┬────┐ -Slot 0 (Float64): │way0│way1│way2│way3│ ← round-robin eviction - └────┴────┴────┴────┘ - ┌────┬────┬────┬────┐ -Slot 1 (Float32): │way0│way1│way2│way3│ - └────┴────┴────┴────┘ - ... +nd_wrappers[N][slot] → cached Array{T,N} + │ + ├─ setfield!(:ref, new_memory_ref) ← update backing memory (0-alloc) + └─ setfield!(:size, new_dims) ← update dimensions (0-alloc) ``` -### Cache Lookup Pseudocode +**Result**: Unlimited dimension patterns per slot with **zero allocation** after warmup. No eviction, no round-robin, no `CACHE_WAYS` limit. ```julia +# Pseudocode for Julia 1.11+ path function unsafe_acquire!(pool, T, dims...) 
typed_pool = get_typed_pool!(pool, T) - slot = n_active + 1 - base = (slot - 1) * CACHE_WAYS - - # Search all ways for matching dimensions - for k in 1:CACHE_WAYS - idx = base + k - if dims == typed_pool.nd_dims[idx] - # Cache hit! Check if underlying vector was resized - if pointer matches - return typed_pool.nd_arrays[idx] - end - end + flat_view = get_view!(typed_pool, prod(dims)) + slot = typed_pool.n_active + + # Direct index lookup by dimensionality N (~1ns) + wrapper = typed_pool.nd_wrappers[N][slot] + if wrapper !== nothing + setfield!(wrapper, :ref, getfield(vec, :ref)) # 0-alloc + setfield!(wrapper, :size, dims) # 0-alloc + return wrapper end - # Cache miss: create new Array header, store in next way (round-robin) - way = typed_pool.nd_next_way[slot] - typed_pool.nd_next_way[slot] = (way + 1) % CACHE_WAYS - # ... create and cache Array ... + # First call for this (slot, N): unsafe_wrap once, cached forever + arr = wrap_array(typed_pool, flat_view, dims) + store_wrapper!(typed_pool, N, slot, arr) + return arr end ``` -**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. The actual data memory is always reused from the pool. +### Julia 1.10 (Legacy): N-Way Set Associative Cache + +On Julia 1.10, `Array` fields cannot be mutated, so the legacy path uses a 4-way set-associative cache with round-robin eviction: + +- Cache hit (≤`CACHE_WAYS` dimension patterns per slot): **0 bytes** +- Cache miss (>`CACHE_WAYS` patterns): **~80-144 bytes** per `unsafe_wrap` call + +See [Configuration](../features/configuration.md) for `CACHE_WAYS` tuning (Julia 1.10 / CUDA only). + +### CUDA: N-Way Cache + +The CUDA backend still uses the N-way set-associative cache (same as Julia 1.10 legacy), since `CuArray` does not support `setfield!`-based mutation. ## View vs Array Return Types Type stability is critical for performance. 
AdaptiveArrayPools provides two APIs: -| API | 1D Return | N-D Return | Allocation | -|-----|-----------|------------|------------| -| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | -| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes (miss) | +| API | 1D Return | N-D Return | Allocation (Julia 1.11+) | Allocation (Julia 1.10 / CUDA) | +|-----|-----------|------------|--------------------------|-------------------------------| +| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | Always 0 bytes | +| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (setfield! reuse) | 0 bytes (hit) / ~100 bytes (miss) | !!! note "`Bit` type behavior" For `T === Bit`, both `acquire!` and `unsafe_acquire!` return native `BitVector` / `BitArray{N}` (not views). Cache hit achieves 0 bytes allocation. diff --git a/docs/src/architecture/type-dispatch.md b/docs/src/architecture/type-dispatch.md index 1feeb6fa..416d7fd9 100644 --- a/docs/src/architecture/type-dispatch.md +++ b/docs/src/architecture/type-dispatch.md @@ -21,57 +21,39 @@ end When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` - no dictionary lookup, no type instability. -## N-Way Set Associative Cache +## N-D Wrapper Caching for `unsafe_acquire!` -For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: +`unsafe_acquire!` returns native `Array` types. The caching strategy depends on Julia version: -``` - CACHE_WAYS = 4 (default) - +----+----+----+----+ -Slot 0 (Float64): |way0|way1|way2|way3| <-- round-robin eviction - +----+----+----+----+ - +----+----+----+----+ -Slot 1 (Float32): |way0|way1|way2|way3| - +----+----+----+----+ - ... -``` +### Julia 1.11+: `setfield!`-based Wrapper Reuse -### Cache Lookup Logic +Julia 1.11 made `Array` a mutable struct, enabling in-place field mutation: ```julia -function unsafe_acquire!(pool, T, dims...) 
- typed_pool = get_typed_pool!(pool, T) - slot = n_active + 1 - base = (slot - 1) * CACHE_WAYS - - # Search all ways for matching dimensions - for k in 1:CACHE_WAYS - idx = base + k - if dims == typed_pool.nd_dims[idx] - # Cache hit! Check if underlying vector was resized - if pointer matches - return typed_pool.nd_arrays[idx] - end - end - end - - # Cache miss: create new Array header, store in next way (round-robin) - way = typed_pool.nd_next_way[slot] - typed_pool.nd_next_way[slot] = (way % CACHE_WAYS) + 1 - # ... create and cache Array ... -end +# Cached wrapper reuse via setfield! (0-alloc) +setfield!(cached_arr, :ref, new_memory_ref) # update backing memory +setfield!(cached_arr, :size, new_dims) # update dimensions ``` -**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. The actual data memory is always reused from the pool. +Wrappers are stored in `nd_wrappers::Vector{Union{Nothing, Vector{Any}}}`, indexed directly by dimensionality N (~1ns lookup). **Unlimited dimension patterns per slot, zero allocation after warmup.** + +### Julia 1.10 / CUDA: N-Way Set Associative Cache + +On Julia 1.10 (CPU) and CUDA, `Array`/`CuArray` fields cannot be mutated. These paths use a 4-way set-associative cache with round-robin eviction (`CACHE_WAYS = 4` default): + +- **Cache hit** (≤4 dim patterns per slot): 0 bytes +- **Cache miss** (>4 patterns): ~80-144 bytes for Array header allocation + +See [Configuration](../features/configuration.md) for `CACHE_WAYS` tuning. --- ## View vs Array: When to Use What? 
-| API | Return Type | Allocation | Recommended For | -|-----|-------------|------------|-----------------| -| `acquire!` | `SubArray` / `ReshapedArray` | **Always 0 bytes** | 99% of cases | -| `unsafe_acquire!` | `Vector` / `Array` | 0-144 bytes | FFI, type constraints | +| API | Return Type | Allocation (Julia 1.11+) | Allocation (1.10 / CUDA) | Recommended For | +|-----|-------------|--------------------------|--------------------------|-----------------| +| `acquire!` | `SubArray` / `ReshapedArray` | **Always 0 bytes** | **Always 0 bytes** | 99% of cases | +| `unsafe_acquire!` | `Vector` / `Array` | **0 bytes** (setfield! reuse) | 0-144 bytes (N-way cache) | FFI, type constraints | ### Why View is the Default @@ -116,15 +98,15 @@ end | Operation | acquire! (View) | unsafe_acquire! (Array) | |-----------|-----------------|-------------------------| -| Allocation (cached) | 0 bytes | 0 bytes | -| Allocation (miss) | 0 bytes | 80-144 bytes | +| Allocation (Julia 1.11+) | 0 bytes | 0 bytes (setfield! reuse) | +| Allocation (Julia 1.10 / CUDA) | 0 bytes | 0 bytes (hit) / 80-144 bytes (miss) | | BLAS operations | Identical | Identical | | Type stability | Guaranteed | Guaranteed | | FFI compatibility | Requires conversion | Direct | -### Header Size by Dimensionality +### Header Size by Dimensionality (Julia 1.10 / CUDA only) -When `unsafe_acquire!` has a cache miss: +On Julia 1.11+ CPU, `unsafe_acquire!` is always zero-allocation via `setfield!` reuse. On Julia 1.10 and CUDA, a cache miss allocates an `Array` header: | Dimensions | Header Size | |------------|-------------| diff --git a/docs/src/basics/api-essentials.md b/docs/src/basics/api-essentials.md index c1bd503a..028bbaed 100644 --- a/docs/src/basics/api-essentials.md +++ b/docs/src/basics/api-essentials.md @@ -21,7 +21,7 @@ end ### `unsafe_acquire!(pool, T, dims...)` -Returns a native `Array` type. **Zero-allocation on cache hit**—only allocates a small header (~80-144 bytes) on cache miss. 
Use when you specifically need `Array{T,N}`: +Returns a native `Array` type. On **Julia 1.11+**, always **zero-allocation** via `setfield!`-based wrapper reuse (unlimited dimension patterns). On Julia 1.10 and CUDA, zero-allocation on cache hit with a small header (~80-144 bytes) on cache miss. Use when you specifically need `Array{T,N}`: ```julia @with_pool pool begin @@ -36,7 +36,7 @@ end ``` !!! tip "Cache behavior" - Same dimension pattern → **0 bytes**. Different pattern → 80-144 bytes header only (data memory always reused). See [N-Way Cache](../architecture/type-dispatch.md#n-way-set-associative-cache) for details. + On Julia 1.11+: **always 0 bytes** regardless of dimension pattern (setfield!-based reuse). On Julia 1.10 / CUDA: same dimension pattern → 0 bytes, different pattern → 80-144 bytes header only (data always reused). See [N-D Wrapper Caching](../architecture/type-dispatch.md#n-d-wrapper-caching-for-unsafe_acquire) for details. !!! note "`Bit` behavior" For `T === Bit`, `unsafe_acquire!` is equivalent to `acquire!` and returns native `BitVector`/`BitArray{N}`. 
@@ -113,7 +113,7 @@ end | Function | Returns | Allocation | Use Case | |----------|---------|------------|----------| | `acquire!(pool, T, dims...)` | View type | 0 bytes | Default choice | -| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0 (hit) / 80-144 (miss) | FFI, type constraints | +| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0 bytes (1.11+) / 0-144 (1.10/CUDA) | FFI, type constraints | | `zeros!(pool, [T,] dims...)` | View type | 0 bytes | Zero-initialized | | `ones!(pool, [T,] dims...)` | View type | 0 bytes | One-initialized | | `similar!(pool, A)` | View type | 0 bytes | Match existing array | diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index c59f4f40..37048004 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -79,16 +79,14 @@ Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x fast ### N-D Caching & Zero Allocation -The pool uses an N-way associative cache to efficiently reuse `BitArray{N}` instances: +The pool reuses `BitArray{N}` wrapper instances via `setfield!`-based in-place mutation (Julia 1.11+) or N-way cache (Julia 1.10 / CUDA): -| Scenario | Allocation | -|----------|------------| -| First call with new dims | ~944 bytes (new `BitArray{N}` created) | -| Subsequent call with same dims | **0 bytes** (cached instance reused) | -| Same ndims, different dims | **0 bytes** (dims/len fields modified in-place) | -| Different ndims | ~944 bytes (new `BitArray{N}` created and cached) | +| Scenario | Julia 1.11+ | Julia 1.10 / CUDA | +|----------|-------------|-------------------| +| First call with new (slot, N) | ~944 bytes (new `BitArray{N}`) | ~944 bytes | +| Subsequent call, any dims | **0 bytes** (setfield! reuse) | **0 bytes** (same ndims) / ~944 bytes (different ndims) | -Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-place modification of its `dims` and `len` fields. 
The pool exploits this to achieve **zero allocation** on repeated calls with matching dimensionality. +On Julia 1.11+, `BitArray` fields (`len`, `dims`, `chunks`) are mutated in-place via `setfield!`, achieving **zero allocation** on all repeated calls regardless of dimension pattern. ```julia @with_pool pool begin @@ -98,12 +96,12 @@ Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-plac # Rewind to reuse the same slot rewind!(pool) - # Same dims: 0 allocation (exact cache hit) + # Same dims: 0 allocation (cached wrapper reused) m2 = acquire!(pool, Bit, 100, 100) rewind!(pool) - # Different dims but same ndims: 0 allocation (dims modified in-place) + # Different dims but same ndims: 0 allocation (fields updated in-place) m3 = acquire!(pool, Bit, 50, 200) end ``` diff --git a/docs/src/features/configuration.md b/docs/src/features/configuration.md index fdc1d168..4cc3fd0c 100644 --- a/docs/src/features/configuration.md +++ b/docs/src/features/configuration.md @@ -70,9 +70,13 @@ POOL_DEBUG[] = false # Disable (default, production) When enabled, returning a pool-backed array from a `@with_pool` block will throw an error. -## Compile-time: CACHE_WAYS +## Compile-time: CACHE_WAYS (Julia 1.10 / CUDA only) -Configure the N-way cache size for `unsafe_acquire!`. Higher values reduce cache eviction but increase memory per slot. +Configure the N-way cache size for `unsafe_acquire!`. **On Julia 1.11+ CPU, this setting has no effect** — the `setfield!`-based wrapper reuse supports unlimited dimension patterns with zero allocation. + +This setting is relevant for: +- **Julia 1.10** (legacy N-way cache path) +- **CUDA backend** (N-way cache for `CuArray` wrappers) ```toml # LocalPreferences.toml @@ -88,15 +92,13 @@ set_cache_ways!(8) # Restart Julia for changes to take effect ``` -**When to increase**: If your code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). 
- -> **Scope**: `cache_ways` affects **all `unsafe_acquire!`** calls (including 1D). Only `acquire!` 1D uses simple 1:1 caching. +**When to increase**: If your CUDA code or Julia 1.10 code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). ## Summary | Setting | Scope | Restart? | Priority | Affects | |---------|-------|----------|----------|---------| | `use_pooling` | Compile-time | Yes | ⭐ Primary | All macros, `acquire!` behavior | -| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching | +| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching (Julia 1.10 / CUDA only) | | `MAYBE_POOLING_ENABLED` | Runtime | No | Optional | `@maybe_with_pool` only | | `POOL_DEBUG` | Runtime | No | Debug | Safety validation | diff --git a/docs/src/features/cuda-support.md b/docs/src/features/cuda-support.md index c5778c8c..13b2ce50 100644 --- a/docs/src/features/cuda-support.md +++ b/docs/src/features/cuda-support.md @@ -47,12 +47,17 @@ The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: **GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. -**CPU Memory**: -- Cache hit (≤4 dimension patterns per slot): 0 bytes -- Cache miss (>4 patterns): ~100 bytes for wrapper metadata +**CPU-side Wrapper Memory** (for `unsafe_acquire!` N-D on CUDA): +- The CUDA backend uses an N-way set-associative cache for `CuArray` wrapper reuse +- Cache hit (≤`CACHE_WAYS` dimension patterns per slot): 0 bytes +- Cache miss (>`CACHE_WAYS` patterns): ~100 bytes for wrapper metadata +- See [Configuration](configuration.md) for `CACHE_WAYS` tuning + +!!! note "CPU vs CUDA caching" + On CPU (Julia 1.11+), `unsafe_acquire!` uses `setfield!`-based wrapper reuse with **zero allocation for any number of dimension patterns**. 
The CUDA backend does not yet support this optimization and still uses the N-way cache. ```julia -# Example: 4 patterns fit in 4-way cache → zero CPU allocation +# Example: 4 patterns fit in default 4-way cache → zero CPU-side allocation dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) for dims in dims_list @with_pool :cuda p begin diff --git a/docs/src/reference/api.md b/docs/src/reference/api.md index a6c5c55e..2b808ea0 100644 --- a/docs/src/reference/api.md +++ b/docs/src/reference/api.md @@ -51,7 +51,7 @@ Default element type is `Float64` (CPU) or `Float32` (CUDA). | `USE_POOLING` | Compile-time constant to disable all pooling. | | `MAYBE_POOLING_ENABLED` | Runtime `Ref{Bool}` for `@maybe_with_pool`. | | `POOL_DEBUG` | Runtime `Ref{Bool}` to enable safety validation. | -| `set_cache_ways!(n)` | Set N-way cache size. | +| `set_cache_ways!(n)` | Set N-way cache size (Julia 1.10 / CUDA only; no effect on Julia 1.11+ CPU). | --- diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 7092822f..c369ca5b 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -19,28 +19,25 @@ export AbstractTypedPool, AbstractArrayPool # For subtyping export DisabledPool, DISABLED_CPU, pooling_enabled # Disabled pool support # Note: Extensions add methods to _get_pool_for_backend(::Val{:backend}) directly -# Core data structures -include("types.jl") - -# Debugging & validation utilities (needed by macros) -include("utils.jl") - -# Acquisition operations: get_view!, acquire!, unsafe_acquire!, aliases -include("acquire.jl") - -# BitArray-specific acquisition (SIMD-optimized BitVector operations) -include("bitarray.jl") - -# Convenience functions: zeros!, ones!, similar! -include("convenience.jl") - -# State management: checkpoint!, rewind!, reset!, empty! 
-include("state.jl") - -# Task-local pool -include("task_local_pool.jl") - -# Macros: @with_pool, @maybe_with_pool -include("macros.jl") +# All includes grouped under a single version branch +@static if VERSION >= v"1.11-" + include("types.jl") + include("utils.jl") + include("acquire.jl") + include("bitarray.jl") + include("convenience.jl") + include("state.jl") + include("task_local_pool.jl") + include("macros.jl") +else + include("legacy/types.jl") + include("utils.jl") + include("legacy/acquire.jl") + include("legacy/bitarray.jl") + include("convenience.jl") + include("legacy/state.jl") + include("task_local_pool.jl") + include("macros.jl") +end end # module diff --git a/src/acquire.jl b/src/acquire.jl index b0326f1a..b9c26118 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -91,60 +91,80 @@ function get_view!(tp::AbstractTypedPool{T}, n::Int) where {T} end # ============================================================================== -# Get N-D Array/View (Internal - Zero-Allocation Cache) +# Get N-D Array (setfield!-based Wrapper Reuse, Julia 1.11+) # ============================================================================== +# +# Julia 1.11+ changed Array to mutable struct {ref::MemoryRef{T}, size::NTuple{N,Int}}, +# enabling in-place mutation via setfield!. This eliminates N-way cache eviction limits: +# unlimited dimension patterns per slot, 0-alloc after warmup for any dims with same N. """ - get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) -> Array{T,N} + _store_nd_wrapper!(tp::AbstractTypedPool, N::Int, slot::Int, wrapper) -Get an N-dimensional `Array` from the pool with N-way caching. +Store a cached N-D wrapper for the given slot. Creates the per-N Vector if needed. 
""" -@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} - total_len = safe_prod(dims) - flat_view = get_view!(tp, total_len) # Increments n_active - slot = tp.n_active - - @inbounds vec = tp.vectors[slot] - current_ptr = UInt(pointer(vec)) - - # Expand cache slots if needed (CACHE_WAYS entries per slot) - n_slots_cached = length(tp.nd_next_way) - while slot > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) +function _store_nd_wrapper!(tp::AbstractTypedPool, N::Int, slot::Int, wrapper) + # Grow nd_wrappers vector so index N is valid + if N > length(tp.nd_wrappers) + old_len = length(tp.nd_wrappers) + resize!(tp.nd_wrappers, N) + for i in (old_len+1):N + @inbounds tp.nd_wrappers[i] = nothing end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 end - - base = (slot - 1) * CACHE_WAYS - - # Linear Search across all ways (Cache hit = 0 bytes) - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_dims = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - if cached_dims isa NTuple{N, Int} && cached_dims == dims && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::Array{T, N} + wrappers = @inbounds tp.nd_wrappers[N] + if wrappers === nothing + wrappers = Vector{Any}(nothing, slot) + @inbounds tp.nd_wrappers[N] = wrappers + elseif slot > length(wrappers) + old_len = length(wrappers) + resize!(wrappers, slot) + for i in (old_len+1):slot + @inbounds wrappers[i] = nothing end end + @inbounds wrappers[slot] = wrapper + nothing +end - # Cache Miss - Round-Robin Replacement - @inbounds way_offset = tp.nd_next_way[slot] - target_idx = base + way_offset + 1 +""" + get_nd_array!(tp::AbstractTypedPool{T,Vector{T}}, dims::NTuple{N,Int}) -> Array{T,N} - arr = wrap_array(tp, flat_view, dims) +Get an N-dimensional `Array` from the pool with `setfield!`-based wrapper reuse. 
- @inbounds tp.nd_arrays[target_idx] = arr - @inbounds tp.nd_dims[target_idx] = dims - @inbounds tp.nd_ptrs[target_idx] = current_ptr +Uses Julia 1.11+ `setfield!` to mutate cached `Array` wrappers in-place: +- Same N (dimensionality): `setfield!(arr, :size, dims)` — 0 allocation +- Backing memory: `setfield!(arr, :ref, ...)` — always updated, 0 allocation in compiled code +- First call per (slot, N): `unsafe_wrap` once, then cached forever - # Update round-robin counter - @inbounds tp.nd_next_way[slot] = (way_offset + 1) % CACHE_WAYS +Unlike the N-way cache (Julia 1.10), this has no eviction limit — unlimited dimension +patterns per slot are supported with zero allocation after warmup. +""" +@inline function get_nd_array!(tp::AbstractTypedPool{T, Vector{T}}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # Increments n_active, ensures backing vec + slot = tp.n_active + @inbounds vec = tp.vectors[slot] + # Look up cached wrapper for this dimensionality (direct index, no hash) + wrappers = N <= length(tp.nd_wrappers) ? (@inbounds tp.nd_wrappers[N]) : nothing + if wrappers !== nothing && slot <= length(wrappers) + wrapper = @inbounds wrappers[slot] + if wrapper !== nothing + arr = wrapper::Array{T, N} + # Always update ref: resize! can grow in-place without changing pointer, + # but the old MemoryRef still has the old (smaller) Memory length. + # setfield!(:ref) is 0-alloc in compiled code (only 32B at REPL top-level). 
+ setfield!(arr, :ref, getfield(vec, :ref)) + # Update dimensions (0-alloc: NTuple stored inline in mutable Array) + setfield!(arr, :size, dims) + return arr + end + end + + # Cache miss: first call for this (slot, N) — unsafe_wrap once + arr = wrap_array(tp, flat_view, dims) + _store_nd_wrapper!(tp, N, slot, arr) return arr end diff --git a/src/bitarray.jl b/src/bitarray.jl index eb9addf9..8e9e4ab3 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -1,13 +1,13 @@ # ============================================================================== -# BitArray Acquisition (N-D Cached BitArray API) +# BitArray Acquisition — Julia 1.11+ (setfield!-based Wrapper Reuse) # ============================================================================== # -# This file contains BitArray-specific pool operations, separated from the -# generic Array acquisition code in acquire.jl for maintainability. +# This file contains BitArray-specific pool operations for Julia 1.11+. +# Uses setfield!-based wrapper reuse for unlimited dim patterns, 0-alloc. # # Key components: # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type -# - get_bitarray! - N-D BitArray with shared chunks and N-way caching +# - get_bitarray! - N-D BitArray with setfield!-based caching # - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance # - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching # - DisabledPool fallbacks for Bit type @@ -15,27 +15,8 @@ # Design Decision: Unified BitArray Return Type # ============================================= # Unlike regular types where acquire! returns SubArray and unsafe_acquire! -# returns Array, for Bit type BOTH return BitArray{N}. This design choice is -# intentional for several reasons: -# -# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and -# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents -# because they use SIMD-optimized chunked algorithms. -# -# 2. 
**API Simplicity**: Users always get BitArray regardless of which API -# they call. No need to remember "use unsafe_acquire! for performance". -# -# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields -# when ndims matches, achieving 0 allocation on repeated calls. This is -# unique to BitArray - regular Array cannot modify dims in place. -# -# 4. **Backwards Compatibility**: Code using trues!/falses! just works with -# optimal performance - these convenience functions return BitVector. -# -# Implementation: -# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! -# - get_bitarray! creates BitArray shells sharing pool's chunks -# - N-way cache stores BitArray{N} entries, reused via dims modification +# returns Array, for Bit type BOTH return BitArray{N}. This ensures users +# always get SIMD-optimized performance (~10-100x faster count/sum/bitwise). # ============================================================================== # ============================================================================== @@ -55,14 +36,7 @@ Get a BitArray{N} that shares `chunks` with the pooled BitVector. -Uses N-way cache for BitArray reuse. Unlike Array which requires unsafe_wrap -for each shape, BitArray can reuse cached entries by modifying `dims`/`len` -fields when ndims matches (0 bytes allocation). - -## Cache Strategy -- **Exact match**: Return cached BitArray directly (0 bytes) -- **Same ndims**: Modify dims/len/chunks of cached entry (0 bytes) -- **Different ndims**: Create new BitArray{N} and cache it (~944 bytes) +Uses `setfield!`-based wrapper reuse — unlimited dim patterns, 0-alloc after warmup. 
## Implementation Notes - BitVector (N=1): `size()` uses `len` field, `dims` is ignored @@ -87,19 +61,8 @@ function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} ba = BitArray{N}(undef, dims) ba.chunks = pool_bv.chunks - # Expand N-way cache (CACHE_WAYS entries per slot) - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - - # Cache in first way - base = (idx - 1) * CACHE_WAYS + 1 - @inbounds tp.nd_arrays[base] = ba - @inbounds tp.nd_dims[base] = dims - @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + # Cache the wrapper + _store_nd_wrapper!(tp, N, idx, ba) # Warn at powers of 2 (possible missing rewind!) if idx >= 512 && (idx & (idx - 1)) == 0 @@ -115,44 +78,25 @@ function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} if length(pool_bv) != total_len resize!(pool_bv, total_len) end - current_ptr = UInt(pointer(pool_bv.chunks)) - base = (idx - 1) * CACHE_WAYS - - # 3. Check N-way cache for hit - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_dims = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - # Must check isa FIRST for type stability (avoids boxing in == comparison) - if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr - if cached_dims == dims - # Exact match - return cached BitArray directly (0 alloc) - return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} - else - # Same ndims but different dims - reuse by modifying fields (0 alloc!) - ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} - ba.len = total_len - ba.dims = dims - ba.chunks = pool_bv.chunks - # Update cache metadata - @inbounds tp.nd_dims[cache_idx] = dims - return ba - end + + # 3. Check wrapper cache (direct index, no hash) + wrappers = N <= length(tp.nd_wrappers) ? 
(@inbounds tp.nd_wrappers[N]) : nothing + if wrappers !== nothing && idx <= length(wrappers) + wrapper = @inbounds wrappers[idx] + if wrapper !== nothing + ba = wrapper::BitArray{N} + # Update fields in-place (all 0-alloc via setfield!) + setfield!(ba, :len, total_len) + setfield!(ba, :dims, dims) + setfield!(ba, :chunks, pool_bv.chunks) + return ba end end - # 4. Cache miss - create new BitArray{N} + # 4. Cache miss: first call for this (slot, N) ba = BitArray{N}(undef, dims) ba.chunks = pool_bv.chunks - - # Round-robin replacement - @inbounds way_offset = tp.nd_next_way[idx] - target_idx = base + way_offset + 1 - @inbounds tp.nd_arrays[target_idx] = ba - @inbounds tp.nd_dims[target_idx] = dims - @inbounds tp.nd_ptrs[target_idx] = current_ptr - @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + _store_nd_wrapper!(tp, N, idx, ba) return ba end diff --git a/src/legacy/acquire.jl b/src/legacy/acquire.jl new file mode 100644 index 00000000..eb47ab03 --- /dev/null +++ b/src/legacy/acquire.jl @@ -0,0 +1,479 @@ +# ============================================================================== +# Allocation Dispatch Points (for extensibility) +# ============================================================================== + +# Allocate a new vector (dispatch point for extensions) +@inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} = + Vector{T}(undef, n) + +# Wrap flat view into N-D array (dispatch point for extensions) +@inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + unsafe_wrap(Array{T,N}, pointer(flat_view), dims) +end + +# ============================================================================== +# Helper: Overflow-Safe Product +# ============================================================================== + +""" + safe_prod(dims::NTuple{N, Int}) -> Int + +Compute the product of dimensions with overflow checking. 
+ +Throws `OverflowError` if the product exceeds `typemax(Int)`, preventing +memory corruption from integer overflow in `unsafe_wrap` operations. + +## Performance +Adds ~0.3-1.2 ns overhead (<1%) compared to unchecked `prod()`, which is +negligible relative to the 100-200 ns cost of the full allocation path. +""" +@inline function safe_prod(dims::NTuple{N, Int}) where {N} + total = 1 + for d in dims + total = Base.checked_mul(total, d) + end + return total +end + +# ============================================================================== +# Get 1D View (Internal - Zero-Allocation Cache) +# ============================================================================== + +""" + get_view!(tp::AbstractTypedPool{T}, n::Int) + +Get a 1D vector view of size `n` from the typed pool. +Returns cached view on hit (zero allocation), creates new on miss. +""" +function get_view!(tp::AbstractTypedPool{T}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. Need to expand pool (new slot) + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + new_view = view(tp.vectors[idx], 1:n) + push!(tp.views, new_view) + push!(tp.view_lengths, n) + + # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!() + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bytes = sum(length, tp.vectors) * sizeof(T) + @warn "$(nameof(typeof(tp))){$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + end + + return new_view + end + + # 2. Cache hit: same size requested -> return cached view (ZERO ALLOC) + @inbounds cached_len = tp.view_lengths[idx] + if cached_len == n + return @inbounds tp.views[idx] + end + + # 3. 
Cache miss: different size -> update cache + @inbounds vec = tp.vectors[idx] + if length(vec) < n + resize!(vec, n) + end + + new_view = view(vec, 1:n) + @inbounds tp.views[idx] = new_view + @inbounds tp.view_lengths[idx] = n + + return new_view +end + +# ============================================================================== +# Get N-D Array (N-way Set-Associative Cache, Julia ≤1.10) +# ============================================================================== +# +# On Julia ≤1.10, Array is not a mutable struct, so setfield! cannot be used. +# This provides the N-way cache that stores up to CACHE_WAYS different +# (dims, pointer) patterns per slot via round-robin replacement. + +""" + get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) -> Array{T,N} + +Get an N-dimensional `Array` from the pool with N-way caching. + +Uses a set-associative cache with `CACHE_WAYS` entries per slot (default: 4). +Cache hit (exact dims + pointer match) returns the cached Array at zero cost. +Cache miss creates a new `unsafe_wrap`'d Array (~96 bytes) and stores it via +round-robin replacement. 
+""" +@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # Increments n_active + slot = tp.n_active + + @inbounds vec = tp.vectors[slot] + current_ptr = UInt(pointer(vec)) + + # Expand cache slots if needed (CACHE_WAYS entries per slot) + n_slots_cached = length(tp.nd_next_way) + while slot > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (slot - 1) * CACHE_WAYS + + # Linear Search across all ways (Cache hit = 0 bytes) + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_dims isa NTuple{N, Int} && cached_dims == dims && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::Array{T, N} + end + end + + # Cache Miss - Round-Robin Replacement + @inbounds way_offset = tp.nd_next_way[slot] + target_idx = base + way_offset + 1 + + arr = wrap_array(tp, flat_view, dims) + + @inbounds tp.nd_arrays[target_idx] = arr + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr + + # Update round-robin counter + @inbounds tp.nd_next_way[slot] = (way_offset + 1) % CACHE_WAYS + + return arr +end + +""" + get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) + +Get an N-dimensional view via `reshape` (zero creation cost). 
+""" +@inline function get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached, 0 alloc) + return reshape(flat_view, dims) # ReshapedArray (0 creation cost) +end + +# ============================================================================== +# Type Touch Recording (for selective rewind) +# ============================================================================== + +""" + _record_type_touch!(pool::AbstractArrayPool, ::Type{T}) + +Record that type `T` was touched (acquired) at the current checkpoint depth. +Called by `acquire!` and convenience wrappers; macro-transformed calls use +`_acquire_impl!` directly (bypassing this for zero overhead). + +For fixed-slot types, sets the corresponding bit in `_touched_type_masks`. +For non-fixed-slot types, sets `_touched_has_others` flag. +""" +@inline function _record_type_touch!(pool::AbstractArrayPool, ::Type{T}) where {T} + depth = pool._current_depth + b = _fixed_slot_bit(T) + if b == UInt16(0) + @inbounds pool._touched_has_others[depth] = true + else + @inbounds pool._touched_type_masks[depth] |= b + end + nothing +end + +# CPU-specific override: adds lazy first-touch checkpoint in lazy mode +# and typed-lazy mode. +# _LAZY_MODE_BIT (bit 15) in _touched_type_masks[depth] ↔ depth entered via _lazy_checkpoint! +# _TYPED_LAZY_BIT (bit 14) in _touched_type_masks[depth] ↔ depth entered via _typed_lazy_checkpoint! +# On the first acquire of each fixed-slot type T at that depth, we retroactively save +# n_active BEFORE the acquire (current value is still the parent's count), so that +# the subsequent rewind can restore the parent's state correctly. 
+@inline function _record_type_touch!(pool::AdaptiveArrayPool, ::Type{T}) where {T} + depth = pool._current_depth + b = _fixed_slot_bit(T) + if b == UInt16(0) + @inbounds pool._touched_has_others[depth] = true + else + current_mask = @inbounds pool._touched_type_masks[depth] + # Lazy checkpoint: lazy mode (bit 15) OR typed lazy mode (bit 14), AND first touch. + # Guard: skip if already checkpointed at this depth (prevents double-push when a + # tracked type is also acquired by a helper via acquire! → _record_type_touch!). + if (current_mask & _MODE_BITS_MASK) != 0 && (current_mask & b) == 0 + tp = get_typed_pool!(pool, T) + if @inbounds(tp._checkpoint_depths[end]) != depth + _checkpoint_typed_pool!(tp, depth) + end + end + @inbounds pool._touched_type_masks[depth] = current_mask | b + end + nothing +end + +# ============================================================================== +# Internal Implementation Functions (called by macro-transformed code) +# ============================================================================== + +""" + _acquire_impl!(pool, Type{T}, n) -> SubArray{T,1,Vector{T},...} + _acquire_impl!(pool, Type{T}, dims...) -> ReshapedArray{T,N,...} + +Internal implementation of acquire!. Called directly by macro-transformed code +(no type touch recording). User code calls `acquire!` which adds recording. +""" +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + tp = get_typed_pool!(pool, T) + return get_view!(tp, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_view!(tp, dims) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + _acquire_impl!(pool, T, dims...) 
+end + +# Similar-style +@inline _acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _acquire_impl!(pool, eltype(x), size(x)) + +""" + _unsafe_acquire_impl!(pool, Type{T}, dims...) -> Array{T,N} + +Internal implementation of unsafe_acquire!. Called directly by macro-transformed code. +""" +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, (n,)) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) +end + +# Similar-style +@inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) + +# ============================================================================== +# Acquisition API (User-facing with type touch recording) +# ============================================================================== + +""" + acquire!(pool, Type{T}, n) -> view type + acquire!(pool, Type{T}, dims...) -> view type + acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> view type + +Acquire a pooled array of type `T` with size `n` or dimensions `dims`. + +Returns a pooled array (backend-dependent type): +- **CPU 1D**: `SubArray{T,1,Vector{T},...}` (parent is `Vector{T}`) +- **CPU N-D**: `ReshapedArray{T,N,...}` (zero creation cost) +- **Bit** (`T === Bit`): `BitVector` / `BitArray{N}` (chunks-sharing, SIMD optimized) +- **CUDA**: `CuArray{T,N}` (unified N-way cache) + +For CPU numeric arrays, the return types are `StridedArray`, compatible with +BLAS and broadcasting. 
+ +For type-unspecified paths (struct fields without concrete type parameters), +use [`unsafe_acquire!`](@ref) instead - cached native array instances can be reused. + +## Example +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) # 1D view + m = acquire!(pool, Float64, 10, 10) # 2D view + v .= 1.0 + m .= 2.0 + sum(v) + sum(m) +end +``` + +See also: [`unsafe_acquire!`](@ref) for native array access. +""" +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + _record_type_touch!(pool, T) + _acquire_impl!(pool, T, n) +end + +# Multi-dimensional support (zero-allocation with N-D cache) +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + _record_type_touch!(pool, T) + _acquire_impl!(pool, T, dims...) +end + +# Tuple support: allows acquire!(pool, T, size(A)) where size(A) returns NTuple{N,Int} +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + _record_type_touch!(pool, T) + _acquire_impl!(pool, T, dims...) +end + +# Similar-style convenience methods +""" + acquire!(pool, x::AbstractArray) -> SubArray + +Acquire an array with the same element type and size as `x` (similar to `similar(x)`). + +## Example +```julia +A = rand(10, 10) +@with_pool pool begin + B = acquire!(pool, A) # Same type and size as A + B .= A .* 2 +end +``` +""" +@inline function acquire!(pool::AbstractArrayPool, x::AbstractArray) + _record_type_touch!(pool, eltype(x)) + _acquire_impl!(pool, eltype(x), size(x)) +end + +# ============================================================================== +# Unsafe Acquisition API (Raw Arrays) +# ============================================================================== + +""" + unsafe_acquire!(pool, Type{T}, n) -> backend's native array type + unsafe_acquire!(pool, Type{T}, dims...) 
-> backend's native array type + unsafe_acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> backend's native array type + +Acquire a native array backed by pool memory. + +Returns the backend's native array type: +- **CPU**: `Array{T,N}` (via `unsafe_wrap`) +- **Bit** (`T === Bit`): `BitVector` / `BitArray{N}` (chunks-sharing; equivalent to `acquire!`) +- **CUDA**: `CuArray{T,N}` (via unified view cache) + +## Safety Warning +The returned array is only valid within the `@with_pool` scope. Using it after +the scope ends leads to undefined behavior (use-after-free, data corruption). + +**Do NOT call `resize!`, `push!`, or `append!` on returned arrays** - this causes +undefined behavior as the memory is owned by the pool. + +## When to Use +- **Type-unspecified paths**: Struct fields without concrete type parameters +- FFI calls expecting raw pointers +- APIs that strictly require native array types + +## Example +```julia +@with_pool pool begin + A = unsafe_acquire!(pool, Float64, 100, 100) # Matrix{Float64} + B = unsafe_acquire!(pool, Float64, 100, 100) + C = similar(A) # Regular allocation for result + mul!(C, A, B) # BLAS uses A, B directly +end +# A and B are INVALID after this point! +``` + +See also: [`acquire!`](@ref) for view-based access. +""" +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + _record_type_touch!(pool, T) + _unsafe_acquire_impl!(pool, T, n) +end + +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + _record_type_touch!(pool, T) + _unsafe_acquire_impl!(pool, T, dims...) +end + +# Tuple support +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + _record_type_touch!(pool, T) + _unsafe_acquire_impl!(pool, T, dims) +end + +# Similar-style convenience methods +""" + unsafe_acquire!(pool, x::AbstractArray) -> Array + +Acquire a raw array with the same element type and size as `x` (similar to `similar(x)`). 
+ +## Example +```julia +A = rand(10, 10) +@with_pool pool begin + B = unsafe_acquire!(pool, A) # Matrix{Float64}, same size as A + B .= A .* 2 +end +``` +""" +@inline function unsafe_acquire!(pool::AbstractArrayPool, x::AbstractArray) + _record_type_touch!(pool, eltype(x)) + _unsafe_acquire_impl!(pool, eltype(x), size(x)) +end + +# ============================================================================== +# API Aliases +# ============================================================================== + +""" + acquire_view!(pool, Type{T}, dims...) + +Alias for [`acquire!`](@ref). + +Explicit name emphasizing the return type is a view (`SubArray`/`ReshapedArray`), +not a raw `Array`. Use when you prefer symmetric naming with `acquire_array!`. +""" +const acquire_view! = acquire! + +""" + acquire_array!(pool, Type{T}, dims...) + +Alias for [`unsafe_acquire!`](@ref). + +Explicit name emphasizing the return type is a raw `Array`. +Use when you prefer symmetric naming with `acquire_view!`. +""" +const acquire_array! = unsafe_acquire! + +# Internal implementation aliases (for macro transformation) +const _acquire_view_impl! = _acquire_impl! +const _acquire_array_impl! = _unsafe_acquire_impl! + +# ============================================================================== +# DisabledPool Acquire Fallbacks (pooling disabled with backend context) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{T}, n::Int) where {T} = Vector{T}(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = Array{T,N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) + +# --- unsafe_acquire! 
for DisabledPool{:cpu} --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, n::Int) where {T} = Vector{T}(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = Array{T,N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) + +# --- Generic DisabledPool fallbacks (unknown backend → error) --- +@inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) +@inline unsafe_acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) + +# --- _impl! delegators for DisabledPool (macro transformation support) --- +# Called when: USE_POOLING=true + @maybe_with_pool + MAYBE_POOLING_ENABLED[]=false +# Explicit overloads for proper inlining (especially important for CUDA backend). +@inline _acquire_impl!(p::DisabledPool, ::Type{T}, n::Int) where {T} = acquire!(p, T, n) +@inline _acquire_impl!(p::DisabledPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = acquire!(p, T, dims...) +@inline _acquire_impl!(p::DisabledPool, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = acquire!(p, T, dims) +@inline _acquire_impl!(p::DisabledPool, x::AbstractArray) = acquire!(p, x) + +@inline _unsafe_acquire_impl!(p::DisabledPool, ::Type{T}, n::Int) where {T} = unsafe_acquire!(p, T, n) +@inline _unsafe_acquire_impl!(p::DisabledPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = unsafe_acquire!(p, T, dims...) 
+@inline _unsafe_acquire_impl!(p::DisabledPool, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = unsafe_acquire!(p, T, dims) +@inline _unsafe_acquire_impl!(p::DisabledPool, x::AbstractArray) = unsafe_acquire!(p, x) diff --git a/src/legacy/bitarray.jl b/src/legacy/bitarray.jl new file mode 100644 index 00000000..97731f68 --- /dev/null +++ b/src/legacy/bitarray.jl @@ -0,0 +1,205 @@ +# ============================================================================== +# BitArray Acquisition — Legacy (Julia ≤1.10, N-way Set-Associative Cache) +# ============================================================================== +# +# This file contains BitArray-specific pool operations for Julia ≤1.10. +# Uses N-way set-associative cache for N-D BitArray caching. +# +# Key components: +# - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type +# - get_bitarray! - N-D BitArray with shared chunks and N-way caching +# - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance +# - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching +# - DisabledPool fallbacks for Bit type +# +# Design Decision: Unified BitArray Return Type +# ============================================= +# Unlike regular types where acquire! returns SubArray and unsafe_acquire! +# returns Array, for Bit type BOTH return BitArray{N}. This design choice is +# intentional for several reasons: +# +# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and +# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents +# because they use SIMD-optimized chunked algorithms. +# +# 2. **API Simplicity**: Users always get BitArray regardless of which API +# they call. No need to remember "use unsafe_acquire! for performance". +# +# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields +# when ndims matches, achieving 0 allocation on repeated calls. 
This is +# unique to BitArray - regular Array cannot modify dims in place. +# +# 4. **Backwards Compatibility**: Code using trues!/falses! just works with +# optimal performance - these convenience functions return BitVector. +# +# Implementation: +# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! +# - get_bitarray! creates BitArray shells sharing pool's chunks +# - N-way cache stores BitArray{N} entries, reused via dims modification +# ============================================================================== + +# ============================================================================== +# Fill Value Dispatch (BitArray-specific) +# ============================================================================== + +# Bit type returns Bool element type for fill operations (zero/one) +@inline Base.zero(::Type{Bit}) = false +@inline Base.one(::Type{Bit}) = true + +# ============================================================================== +# BitArray Acquisition (N-way set-associative cache, Julia ≤1.10) +# ============================================================================== + +""" + get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) -> BitArray{N} + +Get a BitArray{N} that shares `chunks` with the pooled BitVector. + +Uses N-way set-associative cache with up to CACHE_WAYS patterns per slot. + +## Implementation Notes +- BitVector (N=1): `size()` uses `len` field, `dims` is ignored +- BitArray{N>1}: `size()` uses `dims` field +- All BitArrays share `chunks` with the pool's backing BitVector + +## Safety +The returned BitArray is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} + total_len = safe_prod(dims) + tp.n_active += 1 + idx = tp.n_active + + # 1. 
Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, total_len) + push!(tp.vectors, pool_bv) + + # Create BitArray sharing chunks + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = ba + @inbounds tp.nd_dims[base] = dims + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bytes = sum(_vector_bytes, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + end + + return ba + end + + # 2. Ensure pool_bv has correct size + @inbounds pool_bv = tp.vectors[idx] + if length(pool_bv) != total_len + resize!(pool_bv, total_len) + end + current_ptr = UInt(pointer(pool_bv.chunks)) + base = (idx - 1) * CACHE_WAYS + + # 3. Check N-way cache for hit + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + # Must check isa FIRST for type stability (avoids boxing in == comparison) + if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr + if cached_dims == dims + # Exact match - return cached BitArray directly (0 alloc) + return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + else + # Same ndims but different dims - reuse by modifying fields (0 alloc!) + ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + ba.len = total_len + ba.dims = dims + ba.chunks = pool_bv.chunks + # Update cache metadata + @inbounds tp.nd_dims[cache_idx] = dims + return ba + end + end + end + + # 4. 
Cache miss - create new BitArray{N} + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = ba + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return ba +end + +# Convenience: 1D case wraps to tuple +@inline get_bitarray!(tp::BitTypedPool, n::Int) = get_bitarray!(tp, (n,)) + +# ============================================================================== +# Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) +# ============================================================================== + +# Bit type: delegates to _unsafe_acquire_impl! for SIMD performance +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + return _unsafe_acquire_impl!(pool, Bit, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) 
+end + +# ============================================================================== +# Unsafe Acquire Implementation (Bit type) +# ============================================================================== + +# Bit type: returns BitArray{N} with shared chunks (SIMD optimized, N-D cached) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) +end + +# ============================================================================== +# DisabledPool Fallbacks (Bit type) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + +# --- unsafe_acquire! 
for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) diff --git a/src/legacy/state.jl b/src/legacy/state.jl new file mode 100644 index 00000000..1b512104 --- /dev/null +++ b/src/legacy/state.jl @@ -0,0 +1,640 @@ +# ============================================================================== +# State Management — Legacy (Julia ≤1.10) +# ============================================================================== +# Identical to v1.11+ state.jl except empty! clears legacy N-way cache fields +# (nd_arrays, nd_dims, nd_ptrs, nd_next_way) instead of nd_wrappers. + +# ============================================================================== +# State Management - checkpoint! +# ============================================================================== + +""" + checkpoint!(pool::AdaptiveArrayPool) + +Save the current pool state (n_active counters) to internal stacks. + +This is called automatically by `@with_pool` and related macros. +After warmup, this function has **zero allocation**. 
+ +See also: [`rewind!`](@ref), [`@with_pool`](@ref) +""" +function checkpoint!(pool::AdaptiveArrayPool) + + # Increment depth and initialize type touch tracking state + pool._current_depth += 1 + push!(pool._touched_type_masks, UInt16(0)) + push!(pool._touched_has_others, false) + depth = pool._current_depth + + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) + end + + # Others - iterate without allocation (values() returns iterator) + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + end + + return nothing +end + +""" + checkpoint!(pool::AdaptiveArrayPool, ::Type{T}) + +Save state for a specific type only. Used by optimized macros that know +which types will be used at compile time. + +Also updates _current_depth and bitmask state for type touch tracking. + +~77% faster than full checkpoint! when only one type is used. +""" +@inline function checkpoint!(pool::AdaptiveArrayPool, ::Type{T}) where T + + pool._current_depth += 1 + push!(pool._touched_type_masks, UInt16(0)) + # Push true when T is a fallback type (non-fixed-slot) so that + # _typed_lazy_rewind! iterates pool.others even if _acquire_impl! + # (which bypasses _record_type_touch!) is the only acquire path. + push!(pool._touched_has_others, _fixed_slot_bit(T) == UInt16(0)) + _checkpoint_typed_pool!(get_typed_pool!(pool, T), pool._current_depth) + nothing +end + +""" + checkpoint!(pool::AdaptiveArrayPool, types::Type...) + +Save state for multiple specific types. Uses @generated for zero-overhead +compile-time unrolling. Increments _current_depth once for all types. +""" +@generated function checkpoint!(pool::AdaptiveArrayPool, types::Type...) 
+ + # Deduplicate types at compile time (e.g., Float64, Float64 → Float64) + seen = Set{Any}() + unique_indices = Int[] + for i in eachindex(types) + if !(types[i] in seen) + push!(seen, types[i]) + push!(unique_indices, i) + end + end + # Check at compile time if any type is a fallback (non-fixed-slot). + # If so, push has_others=true so _typed_lazy_rewind! iterates pool.others + # even when _acquire_impl! (bypassing _record_type_touch!) is used. + has_any_fallback = any(i -> _fixed_slot_bit(types[i].parameters[1]) == UInt16(0), unique_indices) + checkpoint_exprs = [:(_checkpoint_typed_pool!(get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in unique_indices] + quote + pool._current_depth += 1 + push!(pool._touched_type_masks, UInt16(0)) + push!(pool._touched_has_others, $has_any_fallback) + $(checkpoint_exprs...) + nothing + end +end + +# Internal helper for checkpoint (works for any AbstractTypedPool) +@inline function _checkpoint_typed_pool!(tp::AbstractTypedPool, depth::Int) + + # Guard: skip if already checkpointed at this depth (prevents double-push + # when get_typed_pool! auto-checkpoints a new fallback type and then + # checkpoint!(pool, types...) calls _checkpoint_typed_pool! for the same type). + if @inbounds(tp._checkpoint_depths[end]) != depth + push!(tp._checkpoint_n_active, tp.n_active) + push!(tp._checkpoint_depths, depth) + end + nothing +end + +""" + _lazy_checkpoint!(pool::AdaptiveArrayPool) + +Lightweight checkpoint for lazy mode (`use_typed=false` macro path). + +Increments `_current_depth` and pushes bitmask sentinels — but does **not** save +`n_active` for any fixed-slot typed pool. The `_LAZY_MODE_BIT` (bit 15) in +`_touched_type_masks` marks this depth as lazy mode so that +`_record_type_touch!` can trigger lazy first-touch checkpoints. 
+ +Existing `others` entries are eagerly checkpointed since there is no per-type +tracking for non-fixed-slot pools; Case B in `_rewind_typed_pool!` handles any +new `others` entries created during the scope (n_active starts at 0 = sentinel). + +Performance: ~2ns vs ~540ns for full `checkpoint!`. +""" +@inline function _lazy_checkpoint!(pool::AdaptiveArrayPool) + + pool._current_depth += 1 + # _LAZY_MODE_BIT = lazy mode flag (bits 0–7 are fixed-slot type bits) + push!(pool._touched_type_masks, _LAZY_MODE_BIT) + push!(pool._touched_has_others, false) + depth = pool._current_depth + # Eagerly checkpoint any pre-existing others entries. + # New others types created during the scope start at n_active=0 (sentinel covers them). + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + @inbounds pool._touched_has_others[depth] = true + end + nothing +end + +# ============================================================================== +# State Management - rewind! +# ============================================================================== + +""" + rewind!(pool::AdaptiveArrayPool) + +Restore the pool state (n_active counters) from internal stacks. +Uses _checkpoint_depths to accurately determine which entries to pop vs restore. + +Only the counters are restored; allocated memory remains for reuse. +Handles touched types by checking _checkpoint_depths for accurate restoration. + +**Safety**: If called at global scope (depth=1, no pending checkpoints), +automatically delegates to `reset!` to safely clear all n_active counters. + +See also: [`checkpoint!`](@ref), [`reset!`](@ref), [`@with_pool`](@ref) +""" +function rewind!(pool::AdaptiveArrayPool) + + cur_depth = pool._current_depth + + # Safety guard: at global scope (depth=1), no checkpoint to rewind to + # Delegate to reset! 
which safely clears all n_active counters + if cur_depth == 1 + reset!(pool) + return nothing + end + + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, cur_depth) + end + + # Process fallback types + for tp in values(pool.others) + _rewind_typed_pool!(tp, cur_depth) + end + + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + + return nothing +end + +""" + rewind!(pool::AdaptiveArrayPool, ::Type{T}) + +Restore state for a specific type only. +Also updates _current_depth and bitmask state. +""" +@inline function rewind!(pool::AdaptiveArrayPool, ::Type{T}) where T + + # Safety guard: at global scope (depth=1), delegate to reset! + if pool._current_depth == 1 + reset!(get_typed_pool!(pool, T)) + return nothing + end + _rewind_typed_pool!(get_typed_pool!(pool, T), pool._current_depth) + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing +end + +""" + rewind!(pool::AdaptiveArrayPool, types::Type...) + +Restore state for multiple specific types in reverse order. +Decrements _current_depth once after all types are rewound. +""" +@generated function rewind!(pool::AdaptiveArrayPool, types::Type...) + + # Deduplicate types at compile time (e.g., Float64, Float64 → Float64) + seen = Set{Any}() + unique_indices = Int[] + for i in eachindex(types) + if !(types[i] in seen) + push!(seen, types[i]) + push!(unique_indices, i) + end + end + rewind_exprs = [:(_rewind_typed_pool!(get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in reverse(unique_indices)] + reset_exprs = [:(reset!(get_typed_pool!(pool, types[$i]))) for i in unique_indices] + quote + # Safety guard: at global scope (depth=1), delegate to reset! + if pool._current_depth == 1 + $(reset_exprs...) + return nothing + end + $(rewind_exprs...) 
+ pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing + end +end + +# Internal helper for rewind with orphan cleanup (works for any AbstractTypedPool) +# Uses 1-based sentinel pattern: no isempty checks needed (sentinel [0] guarantees non-empty) +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) + + # 1. Orphaned Checkpoints Cleanup + # If there are checkpoints from deeper scopes (depth > current), pop them first. + # This happens when a nested scope did full checkpoint but typed rewind, + # leaving orphaned checkpoints that must be cleaned before finding current state. + while @inbounds tp._checkpoint_depths[end] > current_depth + pop!(tp._checkpoint_depths) + pop!(tp._checkpoint_n_active) + end + + # 2. Normal Rewind Logic (Sentinel Pattern) + # Now the stack top is guaranteed to be at depth <= current depth. + if @inbounds tp._checkpoint_depths[end] == current_depth + # Checkpointed at current depth: pop and restore + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + else + # No checkpoint at current depth (this type was excluded from typed checkpoint) + # MUST restore n_active from parent checkpoint value! + # - Untracked acquire may have modified n_active + # - If sentinel (_checkpoint_n_active=[0]), restores to n_active=0 + tp.n_active = @inbounds tp._checkpoint_n_active[end] + end + nothing +end + +""" + _lazy_rewind!(pool::AdaptiveArrayPool) + +Complete rewind for lazy mode (`use_typed=false` macro path). + +Reads the combined mask at the current depth, rewinds only the fixed-slot pools +whose bits are set, handles any `others` entries, then pops the depth metadata. + +Called directly from the macro-generated `finally` clause as a single function call +(matching the structure of `_lazy_checkpoint!` for symmetry and performance). 
+""" +@inline function _lazy_rewind!(pool::AdaptiveArrayPool) + + d = pool._current_depth + bits = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK + _selective_rewind_fixed_slots!(pool, bits) + if @inbounds(pool._touched_has_others[d]) + for tp in values(pool.others) + _rewind_typed_pool!(tp, d) + end + end + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing +end + +""" + _typed_lazy_checkpoint!(pool::AdaptiveArrayPool, types::Type...) + +Typed checkpoint that enables lazy first-touch checkpointing for extra types touched +by helpers (`use_typed=true`, `_can_use_typed_path=false` path). + +Calls `checkpoint!(pool, types...)` (checkpoints only the statically-known types), +then sets `_TYPED_LAZY_BIT` (bit 14) in `_touched_type_masks[depth]` to signal typed lazy mode. + +`_record_type_touch!` checks `(mask & _MODE_BITS_MASK) != 0` (bit 14 OR bit 15) to trigger a +lazy first-touch checkpoint for each extra type on first acquire, ensuring Case A +(not Case B) applies at rewind and parent `n_active` is preserved correctly. +""" +@inline function _typed_lazy_checkpoint!(pool::AdaptiveArrayPool, types::Type...) + checkpoint!(pool, types...) + d = pool._current_depth + @inbounds pool._touched_type_masks[d] |= _TYPED_LAZY_BIT + + # Eagerly snapshot pre-existing others entries — mirrors _lazy_checkpoint!. + # _record_type_touch! cannot lazy-checkpoint others types (b==0 branch, no per-type bit). + # Without this, a helper that re-acquires an already-active others type triggers Case B + # at rewind and restores the wrong parent n_active value. + # + # Also set has_others=true when pool.others is non-empty, so _typed_lazy_rewind! + # enters the others loop even for tracked non-fixed-slot types (e.g. CPU Float16) that + # used _acquire_impl! (bypassing _record_type_touch!, leaving has_others=false otherwise). + # Skip re-snapshot for entries already checkpointed at d by checkpoint!(pool, types...) + # (e.g. 
Float16 in types... was just checkpointed above — avoid double-push). + for p in values(pool.others) + if @inbounds(p._checkpoint_depths[end]) != d + _checkpoint_typed_pool!(p, d) + end + @inbounds pool._touched_has_others[d] = true + end + nothing +end + +""" + _typed_lazy_rewind!(pool::AdaptiveArrayPool, tracked_mask::UInt16) + +Selective rewind for typed mode (`use_typed=true`) fallback path. + +Called when `_can_use_typed_path` returns false (helpers touched types beyond the +statically-tracked set). Rewinds only pools whose bits are set in +`tracked_mask | touched_mask`. All touched types have Case A checkpoints, +guaranteed by the `_TYPED_LAZY_BIT` mode set in `_typed_lazy_checkpoint!`. +""" +@inline function _typed_lazy_rewind!(pool::AdaptiveArrayPool, tracked_mask::UInt16) + + d = pool._current_depth + touched = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK + combined = tracked_mask | touched + _selective_rewind_fixed_slots!(pool, combined) + if @inbounds(pool._touched_has_others[d]) + for tp in values(pool.others) + _rewind_typed_pool!(tp, d) + end + end + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing +end + +""" + _selective_rewind_fixed_slots!(pool::AdaptiveArrayPool, mask::UInt16) + +Rewind only the fixed-slot typed pools whose bits are set in `mask`. + +Each of the 8 fixed-slot pools maps to bits 0–7 (same encoding as `_fixed_slot_bit`). +Bits 8–15 (mode flags) are **not** checked here — callers must strip them +before passing the mask (e.g. `mask & _TYPE_BITS_MASK`). 
+""" +@inline function _selective_rewind_fixed_slots!(pool::AdaptiveArrayPool, mask::UInt16) + + d = pool._current_depth + _has_bit(mask, Float64) && _rewind_typed_pool!(pool.float64, d) + _has_bit(mask, Float32) && _rewind_typed_pool!(pool.float32, d) + _has_bit(mask, Int64) && _rewind_typed_pool!(pool.int64, d) + _has_bit(mask, Int32) && _rewind_typed_pool!(pool.int32, d) + _has_bit(mask, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d) + _has_bit(mask, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d) + _has_bit(mask, Bool) && _rewind_typed_pool!(pool.bool, d) + _has_bit(mask, Bit) && _rewind_typed_pool!(pool.bits, d) + nothing +end + +# ============================================================================== +# State Management - empty! (Legacy: N-way cache fields) +# ============================================================================== + +""" + empty!(tp::BitTypedPool) + +Clear all internal storage for BitTypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. +""" +function Base.empty!(tp::BitTypedPool) + empty!(tp.vectors) + # Clear N-way cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(tp::TypedPool) + +Clear all internal storage for TypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. 
+""" +function Base.empty!(tp::TypedPool) + empty!(tp.vectors) + empty!(tp.views) + empty!(tp.view_lengths) + # Clear N-way cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(pool::AdaptiveArrayPool) + +Completely clear the pool, releasing all stored vectors and resetting all state. + +This is useful when you want to free memory or start fresh without creating +a new pool instance. + +## Example +```julia +pool = AdaptiveArrayPool() +v = acquire!(pool, Float64, 1000) +# ... use v ... +empty!(pool) # Release all memory +``` + +## Warning +Any SubArrays previously acquired from this pool become invalid after `empty!`. +""" +function Base.empty!(pool::AdaptiveArrayPool) + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + empty!(tp) + end + + # Others - clear all TypedPools then the IdDict itself + for tp in values(pool.others) + empty!(tp) + end + empty!(pool.others) + + # Reset type touch tracking state (1-based sentinel pattern) + pool._current_depth = 1 # 1 = global scope (sentinel) + empty!(pool._touched_type_masks) + push!(pool._touched_type_masks, UInt16(0)) # Sentinel: no bits set + empty!(pool._touched_has_others) + push!(pool._touched_has_others, false) # Sentinel: no others + + return pool +end + +# ============================================================================== +# State Management - reset! +# ============================================================================== + +""" + reset!(tp::AbstractTypedPool) + +Reset state without clearing allocated storage. +Sets `n_active = 0` and restores checkpoint stacks to sentinel state. 
+""" +function reset!(tp::AbstractTypedPool) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + reset!(pool::AdaptiveArrayPool) + +Reset pool state without clearing allocated storage. + +This function: +- Resets all `n_active` counters to 0 +- Restores all checkpoint stacks to sentinel state +- Resets `_current_depth` and type touch tracking state + +Unlike `empty!`, this **preserves** all allocated vectors, views, and N-D arrays +for reuse, avoiding reallocation costs. + +## Use Case +When functions that acquire from the pool are called without proper +`checkpoint!/rewind!` management, `n_active` can grow indefinitely. +Use `reset!` to cleanly restore the pool to its initial state while +keeping allocated memory available. + +## Example +```julia +pool = AdaptiveArrayPool() + +# Some function that acquires without checkpoint management +function compute!(pool) + v = acquire!(pool, Float64, 100) + # ... use v ... + # No rewind! 
called +end + +for _ in 1:1000 + compute!(pool) # n_active grows each iteration +end + +reset!(pool) # Restore state, keep allocated memory +# Now pool.n_active == 0, but vectors are still available for reuse +``` + +See also: [`empty!`](@ref), [`rewind!`](@ref) +""" +function reset!(pool::AdaptiveArrayPool) + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + reset!(tp) + end + + # Others - reset all TypedPools + for tp in values(pool.others) + reset!(tp) + end + + # Reset type touch tracking state (1-based sentinel pattern) + pool._current_depth = 1 # 1 = global scope (sentinel) + empty!(pool._touched_type_masks) + push!(pool._touched_type_masks, UInt16(0)) # Sentinel: no bits set + empty!(pool._touched_has_others) + push!(pool._touched_has_others, false) # Sentinel: no others + + return pool +end + +""" + reset!(pool::AdaptiveArrayPool, ::Type{T}) + +Reset state for a specific type only. Clears n_active and checkpoint stacks +to sentinel state while preserving allocated vectors. + +See also: [`reset!(::AdaptiveArrayPool)`](@ref), [`rewind!`](@ref) +""" +@inline function reset!(pool::AdaptiveArrayPool, ::Type{T}) where T + reset!(get_typed_pool!(pool, T)) + pool +end + +""" + reset!(pool::AdaptiveArrayPool, types::Type...) + +Reset state for multiple specific types. Uses @generated for zero-overhead +compile-time unrolling. + +See also: [`reset!(::AdaptiveArrayPool)`](@ref), [`rewind!`](@ref) +""" +@generated function reset!(pool::AdaptiveArrayPool, types::Type...) + reset_exprs = [:(reset!(get_typed_pool!(pool, types[$i]))) for i in 1:length(types)] + quote + $(reset_exprs...) + pool + end +end + +# ============================================================================== +# Bitmask Helpers for Typed Path Decisions +# ============================================================================== + +""" + _tracked_mask_for_types(types::Type...) 
-> UInt16 + +Compute compile-time bitmask for the types tracked by a typed checkpoint/rewind. +Uses `@generated` for zero-overhead constant folding. + +Returns `UInt16(0)` when called with no arguments. +Non-fixed-slot types contribute `UInt16(0)` (their bit is 0). +""" +@generated function _tracked_mask_for_types(types::Type...) + mask = UInt16(0) + for i in 1:length(types) + T = types[i].parameters[1] + mask |= _fixed_slot_bit(T) + end + return :(UInt16($mask)) +end + +""" + _can_use_typed_path(pool::AbstractArrayPool, tracked_mask::UInt16) -> Bool + +Check if the typed (fast) checkpoint/rewind path is safe to use. + +Returns `true` when all touched types at the current depth are a subset +of the tracked types (bitmask subset check) AND no non-fixed-slot types were touched. + +The subset check: `(touched_mask & ~tracked_mask) == 0` means every bit set +in `touched_mask` is also set in `tracked_mask`. +""" +@inline function _can_use_typed_path(pool::AbstractArrayPool, tracked_mask::UInt16) + depth = pool._current_depth + touched_mask = @inbounds(pool._touched_type_masks[depth]) & _TYPE_BITS_MASK + has_others = @inbounds pool._touched_has_others[depth] + return (touched_mask & ~tracked_mask) == UInt16(0) && !has_others +end + +# ============================================================================== +# DisabledPool State Management (no-ops) +# ============================================================================== +# DisabledPool doesn't track state, so all operations are no-ops. + +checkpoint!(::DisabledPool) = nothing +checkpoint!(::DisabledPool, ::Type) = nothing +checkpoint!(::DisabledPool, types::Type...) = nothing + +rewind!(::DisabledPool) = nothing +rewind!(::DisabledPool, ::Type) = nothing +rewind!(::DisabledPool, types::Type...) = nothing + +reset!(::DisabledPool) = nothing +reset!(::DisabledPool, ::Type) = nothing +reset!(::DisabledPool, types::Type...) 
= nothing + +Base.empty!(::DisabledPool) = nothing diff --git a/src/legacy/types.jl b/src/legacy/types.jl new file mode 100644 index 00000000..b056ddbb --- /dev/null +++ b/src/legacy/types.jl @@ -0,0 +1,447 @@ +# ============================================================================== +# Constants (Configurable via Preferences) — Legacy (Julia ≤1.10) +# ============================================================================== + +using Preferences + +""" +Number of cache ways per slot for N-way set associative cache. +Supports up to `CACHE_WAYS` different dimension patterns per slot without thrashing. + +Default: 4 (handles most use cases well) + +## Configuration +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.set_cache_ways!(8) # Restart Julia to take effect +``` + +Or manually in `LocalPreferences.toml`: +```toml +[AdaptiveArrayPools] +cache_ways = 8 +``` + +Valid range: 1-16 (higher values increase memory but reduce eviction) +""" +const CACHE_WAYS = let + ways = @load_preference("cache_ways", 4)::Int + if ways < 1 || ways > 16 + @warn "CACHE_WAYS=$ways out of range [1,16], using default 4" + 4 + else + ways + end +end + +""" + set_cache_ways!(n::Int) + +Set the number of cache ways for N-D array caching. +**Requires Julia restart to take effect.** + +Higher values reduce cache eviction but increase memory usage per slot. + +## Arguments +- `n::Int`: Number of cache ways (valid range: 1-16) + +## Example +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.set_cache_ways!(8) # Double the default +# Restart Julia to apply the change +``` +""" +function set_cache_ways!(n::Int) + if n < 1 || n > 16 + throw(ArgumentError("cache_ways must be in range [1, 16], got $n")) + end + @set_preferences!("cache_ways" => n) + @info "CACHE_WAYS set to $n. Restart Julia to apply." 
+ return n +end + +# ============================================================================== +# Abstract Type Hierarchy (for extensibility) +# ============================================================================== + +""" + AbstractTypedPool{T, V<:AbstractVector{T}} + +Abstract base for type-specific memory pools. +""" +abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end + +""" + AbstractArrayPool + +Abstract base for multi-type array pools. +""" +abstract type AbstractArrayPool end + +# ============================================================================== +# Disabled Pool Sentinel Types +# ============================================================================== + +""" + DisabledPool{Backend} + +Sentinel type for disabled pooling that preserves backend context. +When `USE_POOLING=false` (compile-time) or `MAYBE_POOLING_ENABLED[]=false` (runtime), +macros return `DisabledPool{backend}()` instead of `nothing`. + +Backend symbols: +- `:cpu` - Standard Julia arrays +- `:cuda` - CUDA.jl CuArrays (defined in extension) + +This enables `@with_pool :cuda` to return correct array types even when pooling is off. + +## Example +```julia +# When USE_POOLING=false: +@with_pool :cuda pool begin + v = zeros!(pool, 10) # Returns CuArray{Float32}, not Array{Float64}! +end +``` + +See also: [`pooling_enabled`](@ref), [`DISABLED_CPU`](@ref) +""" +struct DisabledPool{Backend} end + +""" + DISABLED_CPU + +Singleton instance for disabled CPU pooling. +Used by macros when `USE_POOLING=false` without backend specification. +""" +const DISABLED_CPU = DisabledPool{:cpu}() + +""" + pooling_enabled(pool) -> Bool + +Returns `true` if `pool` is an active pool, `false` if pooling is disabled. 
+ +## Examples +```julia +@maybe_with_pool pool begin + if pooling_enabled(pool) + # Using pooled memory + else + # Using standard allocation + end +end +``` + +See also: [`DisabledPool`](@ref) +""" +pooling_enabled(::AbstractArrayPool) = true +pooling_enabled(::DisabledPool) = false + +# ============================================================================== +# Core Data Structures — Legacy (Julia ≤1.10, N-way set-associative cache) +# ============================================================================== + +# 1-Based Sentinel Pattern: Arrays start with sentinel values to eliminate +# isempty() checks in hot paths. See docstrings for details. + +""" + TypedPool{T} <: AbstractTypedPool{T, Vector{T}} + +Internal structure managing pooled vectors for a specific element type `T`. + +## Fields + +### Storage +- `vectors`: Backing `Vector{T}` storage (actual memory allocation) + +### 1D Cache (for `acquire!(pool, T, n)`) +- `views`: Cached `SubArray` views for zero-allocation 1D access +- `view_lengths`: Cached lengths for fast Int comparison (SoA pattern) + +### N-D Array Cache (N-way set-associative, Julia ≤1.10) +- `nd_arrays`: Cached N-D `Array` objects (length = slots × CACHE_WAYS) +- `nd_dims`: Cached dimension tuples for cache hit validation +- `nd_ptrs`: Cached pointer values to detect backing vector resize +- `nd_next_way`: Round-robin counter per slot (length = slots) + +### State Management (1-based sentinel pattern) +- `n_active`: Count of currently active (checked-out) arrays +- `_checkpoint_n_active`: Saved n_active values at each checkpoint (sentinel: `[0]`) +- `_checkpoint_depths`: Depth of each checkpoint entry (sentinel: `[0]`) +""" +mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} + # --- Storage --- + vectors::Vector{Vector{T}} + + # --- 1D Cache (1:1 mapping) --- + views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}} + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way set associative) --- + 
nd_arrays::Vector{Any} # length = slots × CACHE_WAYS + nd_dims::Vector{Any} # dimension tuples + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot + + # --- State Management (1-based sentinel pattern) --- + n_active::Int + _checkpoint_n_active::Vector{Int} # Saved n_active at each checkpoint + _checkpoint_depths::Vector{Int} # Depth of each checkpoint +end + +TypedPool{T}() where {T} = TypedPool{T}( + # Storage + Vector{T}[], + # 1D Cache + SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}[], + Int[], + # N-D Array Cache + Any[], Any[], UInt[], Int[], + # State Management (1-based sentinel pattern: guaranteed non-empty) + 0, # n_active + [0], # _checkpoint_n_active: sentinel (n_active=0 at depth=0) + [0] # _checkpoint_depths: sentinel (depth=0 = no checkpoint) +) + +# ============================================================================== +# Bit Sentinel Type +# ============================================================================== + +""" + Bit + +Sentinel type for bit-packed boolean storage via `BitVector`. + +Use `Bit` instead of `Bool` in pool operations to get memory-efficient +bit-packed arrays (1 bit per element vs 1 byte for `Vector{Bool}`). + +## Usage +```julia +@with_pool pool begin + # BitVector (1 bit per element, ~8x memory savings) + bv = acquire!(pool, Bit, 1000) + + # vs Vector{Bool} (1 byte per element) + vb = acquire!(pool, Bool, 1000) + + # Convenience functions work too + mask = falses!(pool, 100) # BitVector filled with false + flags = trues!(pool, 100) # BitVector filled with true +end +``` + +## Return Types (Unified for Performance) +Unlike other types, `Bit` always returns native `BitVector`/`BitArray`: +- **1D**: `BitVector` (both `acquire!` and `unsafe_acquire!`) +- **N-D**: `BitArray{N}` (reshaped, preserves SIMD optimization) + +This design ensures users always get SIMD-optimized performance without +needing to remember which API to use. 
+
+## Performance
+`BitVector` operations like `count()`, `sum()`, and bitwise operations are
+~10x–100x faster than equivalent operations on `SubArray{Bool}` because they
+use SIMD-optimized algorithms on packed 64-bit chunks.
+
+```julia
+@with_pool pool begin
+    bv = acquire!(pool, Bit, 10000)
+    fill!(bv, true)
+    count(bv) # Uses fast SIMD path automatically
+end
+```
+
+## Memory Safety
+The returned `BitVector` shares its internal `chunks` array with the pool.
+It is only valid within the `@with_pool` scope - using it after the scope
+ends leads to undefined behavior (use-after-free risk).
+
+See also: [`trues!`](@ref), [`falses!`](@ref), [`BitTypedPool`](@ref)
+"""
+struct Bit end
+
+# ==============================================================================
+# BitTypedPool — Legacy (Julia ≤1.10, N-way set-associative cache)
+# ==============================================================================
+
+"""
+    BitTypedPool <: AbstractTypedPool{Bool, BitVector}
+
+Specialized pool for `BitVector` arrays with memory reuse.
+ +## Fields +- `vectors`: Backing `BitVector` storage +- `nd_arrays`, `nd_dims`, `nd_ptrs`, `nd_next_way`: N-way cache (Julia ≤1.10) +- `n_active`: Count of currently active arrays +- `_checkpoint_*`: State management stacks (1-based sentinel pattern) + +See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) +""" +mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} + # --- Storage --- + vectors::Vector{BitVector} + + # --- N-D BitArray Cache (N-way set associative) --- + nd_arrays::Vector{Any} # Cached BitArray{N} instances + nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot + + # --- State Management (1-based sentinel pattern) --- + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +BitTypedPool() = BitTypedPool( + # Storage + BitVector[], + # N-D BitArray Cache + Any[], Any[], UInt[], Int[], + # State Management (1-based sentinel pattern) + 0, # n_active + [0], # _checkpoint_n_active: sentinel + [0] # _checkpoint_depths: sentinel +) + +# ============================================================================== +# Fixed Slot Configuration +# ============================================================================== + +""" + FIXED_SLOT_FIELDS + +Field names for fixed slot TypedPools. Single source of truth for `foreach_fixed_slot`. + +When modifying, also update: struct definition, `get_typed_pool!` dispatches, constructor. +Tests verify synchronization automatically. 
+""" +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool, :bits) + +# ============================================================================== +# Bitmask Mode Constants +# ============================================================================== +# Bits 0-7: fixed-slot type touch tracking (one bit per type) +# Bits 14-15: mode flags set during checkpoint to control lazy behavior + +const _LAZY_MODE_BIT = UInt16(0x8000) # bit 15: lazy (dynamic-selective) checkpoint mode +const _TYPED_LAZY_BIT = UInt16(0x4000) # bit 14: typed lazy-fallback mode +const _MODE_BITS_MASK = UInt16(0xC000) # bits 14-15: all mode flags +const _TYPE_BITS_MASK = UInt16(0x00FF) # bits 0-7: fixed-slot type bits + +# ============================================================================== +# Fixed-Slot Bit Mapping (for type touch tracking) +# ============================================================================== +# Maps each fixed-slot type to a unique bit in a UInt16 bitmask. +# Bit ordering matches FIXED_SLOT_FIELDS. Non-fixed types return UInt16(0). + +@inline _fixed_slot_bit(::Type{Float64}) = UInt16(1) << 0 +@inline _fixed_slot_bit(::Type{Float32}) = UInt16(1) << 1 +@inline _fixed_slot_bit(::Type{Int64}) = UInt16(1) << 2 +@inline _fixed_slot_bit(::Type{Int32}) = UInt16(1) << 3 +@inline _fixed_slot_bit(::Type{ComplexF64}) = UInt16(1) << 4 +@inline _fixed_slot_bit(::Type{ComplexF32}) = UInt16(1) << 5 +@inline _fixed_slot_bit(::Type{Bool}) = UInt16(1) << 6 +@inline _fixed_slot_bit(::Type{Bit}) = UInt16(1) << 7 +@inline _fixed_slot_bit(::Type) = UInt16(0) # non-fixed-slot → triggers has_others + +# Check whether a type's bit is set in a bitmask (e.g. _touched_type_masks or combined). 
+@inline _has_bit(mask::UInt16, ::Type{T}) where {T} = (mask & _fixed_slot_bit(T)) != 0 + +# ============================================================================== +# AdaptiveArrayPool +# ============================================================================== + +""" + AdaptiveArrayPool + +Multi-type memory pool with fixed slots for common types and IdDict fallback for others. +Zero allocation after warmup. NOT thread-safe - use one pool per Task. +""" +mutable struct AdaptiveArrayPool <: AbstractArrayPool + # Fixed Slots: common types with zero lookup overhead + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + bits::BitTypedPool # BitVector pool (1 bit per element) + + # Fallback: rare types + others::IdDict{DataType, Any} + + # Type touch tracking (1-based sentinel pattern) + _current_depth::Int # Current scope depth (1 = global scope) + _touched_type_masks::Vector{UInt16} # Per-depth: which fixed slots were touched + mode flags + _touched_has_others::Vector{Bool} # Per-depth: any non-fixed-slot type touched? 
+end + +function AdaptiveArrayPool() + AdaptiveArrayPool( + TypedPool{Float64}(), + TypedPool{Float32}(), + TypedPool{Int64}(), + TypedPool{Int32}(), + TypedPool{ComplexF64}(), + TypedPool{ComplexF32}(), + TypedPool{Bool}(), + BitTypedPool(), + IdDict{DataType, Any}(), + 1, # _current_depth: 1 = global scope (sentinel) + [UInt16(0)], # _touched_type_masks: sentinel (no bits set) + [false] # _touched_has_others: sentinel (no others) + ) +end + +# ============================================================================== +# Type Dispatch (Zero-cost for Fixed Slots) +# ============================================================================== + +# Fast Path: compile-time dispatch, fully inlined +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Float64}) = p.float64 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Float32}) = p.float32 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Int64}) = p.int64 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Int32}) = p.int32 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{ComplexF64}) = p.complexf64 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{ComplexF32}) = p.complexf32 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Bool}) = p.bool +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Bit}) = p.bits + +# Slow Path: rare types via IdDict +@inline function get_typed_pool!(p::AdaptiveArrayPool, ::Type{T}) where {T} + get!(p.others, T) do + tp = TypedPool{T}() + # If inside a checkpoint scope (_current_depth > 1 means inside @with_pool), + # auto-checkpoint the new pool to prevent issues on rewind + if p._current_depth > 1 + push!(tp._checkpoint_n_active, 0) # n_active starts at 0 + push!(tp._checkpoint_depths, p._current_depth) + # Signal that a fallback type was touched so lazy/typed-lazy rewind + # iterates pool.others. Without this, _acquire_impl! (which bypasses + # _record_type_touch!) 
would leave has_others=false, causing the + # rewind to skip pool.others entirely and leak this new type's n_active. + @inbounds p._touched_has_others[p._current_depth] = true + end + tp + end::TypedPool{T} +end + +# ============================================================================== +# Zero-Allocation Iteration +# ============================================================================== + +""" + foreach_fixed_slot(f, pool::AdaptiveArrayPool) + +Apply `f` to each fixed slot TypedPool. Zero allocation via compile-time unrolling. +""" +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + Base.@_inline_meta + $(exprs...) + nothing + end +end diff --git a/src/state.jl b/src/state.jl index 79a20a1c..b591b520 100644 --- a/src/state.jl +++ b/src/state.jl @@ -388,11 +388,7 @@ Restores sentinel values for 1-based sentinel pattern. """ function Base.empty!(tp::BitTypedPool) empty!(tp.vectors) - # Clear N-way wrapper cache - empty!(tp.nd_arrays) - empty!(tp.nd_dims) - empty!(tp.nd_ptrs) - empty!(tp.nd_next_way) + empty!(tp.nd_wrappers) tp.n_active = 0 # Restore sentinel values (1-based sentinel pattern) empty!(tp._checkpoint_n_active) @@ -412,11 +408,7 @@ function Base.empty!(tp::TypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_lengths) - # Clear N-D Array cache (N-way) - empty!(tp.nd_arrays) - empty!(tp.nd_dims) - empty!(tp.nd_ptrs) - empty!(tp.nd_next_way) + empty!(tp.nd_wrappers) tp.n_active = 0 # Restore sentinel values (1-based sentinel pattern) empty!(tp._checkpoint_n_active) diff --git a/src/types.jl b/src/types.jl index 8ccd2c41..c2a01c9b 100644 --- a/src/types.jl +++ b/src/types.jl @@ -158,11 +158,10 @@ Internal structure managing pooled vectors for a specific element type `T`. 
- `views`: Cached `SubArray` views for zero-allocation 1D access - `view_lengths`: Cached lengths for fast Int comparison (SoA pattern) -### N-D Array Cache (for `unsafe_acquire!` only, N-way set associative) -- `nd_arrays`: Cached N-D `Array` objects (length = slots × CACHE_WAYS) -- `nd_dims`: Cached dimension tuples for cache hit validation -- `nd_ptrs`: Cached pointer values to detect backing vector resize -- `nd_next_way`: Round-robin counter per slot (length = slots) +### N-D Wrapper Cache (Julia 1.11+, setfield!-based reuse) +- `nd_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — indexed by N (dimensionality), + each entry is a `Vector{Any}` holding one cached `Array{T,N}` wrapper per slot. Uses `setfield!(wrapper, :size, dims)` + and `setfield!(wrapper, :ref, parent)` for zero-allocation reuse of unlimited dim patterns. ### State Management (1-based sentinel pattern) - `n_active`: Count of currently active (checked-out) arrays @@ -171,7 +170,7 @@ Internal structure managing pooled vectors for a specific element type `T`. ## Note `acquire!` for N-D returns `ReshapedArray` (zero creation cost), so no caching needed. -Only `unsafe_acquire!` benefits from N-D caching since `unsafe_wrap` allocates 112 bytes. +`unsafe_acquire!` uses `setfield!` wrapper reuse — unlimited dim patterns, 0-alloc after warmup.
""" mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} # --- Storage --- @@ -181,11 +180,8 @@ mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}} view_lengths::Vector{Int} - # --- N-D Array Cache (N-way set associative) --- - nd_arrays::Vector{Any} # length = slots × CACHE_WAYS - nd_dims::Vector{Any} # dimension tuples - nd_ptrs::Vector{UInt} # pointer validation - nd_next_way::Vector{Int} # round-robin counter per slot + # --- N-D Wrapper Cache (setfield!-based reuse) --- + nd_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot Array{T,N} # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -199,11 +195,8 @@ TypedPool{T}() where {T} = TypedPool{T}( # 1D Cache SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}[], Int[], - # N-D Array Cache (N-way) - Any[], - Any[], - UInt[], - Int[], + # N-D Wrapper Cache + Union{Nothing, Vector{Any}}[], # State Management (1-based sentinel pattern: guaranteed non-empty) 0, # n_active [0], # _checkpoint_n_active: sentinel (n_active=0 at depth=0) @@ -291,26 +284,10 @@ performance without needing to choose between APIs. 
## Fields - `vectors`: Backing `BitVector` storage -- `nd_arrays`: Cached wrapper BitVectors (chunks sharing) -- `nd_dims`: Cached lengths for wrapper cache validation -- `nd_ptrs`: Cached chunk pointers for invalidation detection -- `nd_next_way`: Round-robin counter for N-way cache +- `nd_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — setfield!-based cache (Julia 1.11+) - `n_active`: Count of currently active arrays - `_checkpoint_*`: State management stacks (1-based sentinel pattern) -## Usage -```julia -@with_pool pool begin - # All return BitVector with SIMD performance - bv = acquire!(pool, Bit, 100) # BitVector - count(bv) # Fast SIMD path - - # Convenience functions - t = trues!(pool, 50) # BitVector filled with true - f = falses!(pool, 50) # BitVector filled with false -end -``` - ## Performance Operations like `count()`, `sum()`, and bitwise operations are ~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because `BitVector` uses @@ -322,15 +299,8 @@ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- N-D BitArray Cache (N-way set associative) --- - # Unlike TypedPool which uses views for 1D and nd_* for N-D, - # BitTypedPool uses nd_* for ALL dimensions (1D, 2D, 3D, etc.). - # No views needed since we always return BitArray{N}, not SubArray. - # BitArray.dims is mutable, enabling 0-alloc reuse for same-ndims requests. 
- nd_arrays::Vector{Any} # Cached BitArray{N} instances - nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) - nd_ptrs::Vector{UInt} # pointer validation - nd_next_way::Vector{Int} # round-robin counter per slot + # --- N-D Wrapper Cache (setfield!-based reuse) --- + nd_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot BitArray{N} # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -341,11 +311,8 @@ end BitTypedPool() = BitTypedPool( # Storage BitVector[], - # 1D BitVector Wrapper Cache (N-way) - Any[], - Any[], - UInt[], - Int[], + # N-D Wrapper Cache + Union{Nothing, Vector{Any}}[], # State Management (1-based sentinel pattern) 0, # n_active [0], # _checkpoint_n_active: sentinel diff --git a/test/legacy/test_nway_cache.jl b/test/legacy/test_nway_cache.jl new file mode 100644 index 00000000..1041c6c6 --- /dev/null +++ b/test/legacy/test_nway_cache.jl @@ -0,0 +1,189 @@ +using Test +using AdaptiveArrayPools + +@testset "N-way Cache for unsafe_acquire!" begin + + @testset "Type checks" begin + pool = AdaptiveArrayPool() + + @with_pool pool begin + # acquire! returns ReshapedArray for N-D + arr = acquire!(pool, Float64, 10, 10) + @test arr isa Base.ReshapedArray{Float64, 2} + + # acquire! returns SubArray for 1D + vec = acquire!(pool, Float64, 100) + @test vec isa SubArray{Float64, 1} + + # unsafe_acquire! returns Array + raw = unsafe_acquire!(pool, Float64, 10, 10) + @test raw isa Array{Float64, 2} + @test raw isa Matrix{Float64} + end + end + + @testset "CACHE_WAYS configuration" begin + # Verify CACHE_WAYS is exported and accessible + @test CACHE_WAYS isa Int + @test 1 <= CACHE_WAYS <= 16 # Valid range + + # Verify set_cache_ways! is exported + @test isdefined(AdaptiveArrayPools, :set_cache_ways!) + end + + @testset "set_cache_ways! 
validation" begin + # Valid values should return the input value + @test set_cache_ways!(1) == 1 + @test set_cache_ways!(4) == 4 + @test set_cache_ways!(8) == 8 + @test set_cache_ways!(16) == 16 + + # Invalid values should throw ArgumentError + @test_throws ArgumentError set_cache_ways!(0) + @test_throws ArgumentError set_cache_ways!(-1) + @test_throws ArgumentError set_cache_ways!(17) + @test_throws ArgumentError set_cache_ways!(100) + + # Reset to default after tests + set_cache_ways!(4) + end + +end + +@testset "N-way Zero-Allocation" begin + + @testset "N-D unsafe_acquire!: 4-way alternating is zero-alloc" begin + pool = AdaptiveArrayPool() + + function test_nd_4way!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16)) + for _ in 1:100 + for dims in dims_list + @with_pool p begin + unsafe_acquire!(p, Float64, dims...) + end + end + end + end + + # Warmup + test_nd_4way!(pool) + test_nd_4way!(pool) + + # Measure + allocs = @allocated test_nd_4way!(pool) + allocs > 0 && @warn "N-D 4-way: $allocs bytes (expected 0)" + @test allocs == 0 + end + + @testset "N-D acquire!: 5-way is zero-alloc (ReshapedArray)" begin + # acquire! returns ReshapedArray → no N-way cache needed → always 0 alloc + pool = AdaptiveArrayPool() + + function test_nd_5way_acquire!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16), (4, 12)) + for _ in 1:100 + for dims in dims_list + @with_pool p begin + acquire!(p, Float64, dims...) # ReshapedArray + end + end + end + end + + # Warmup + test_nd_5way_acquire!(pool) + test_nd_5way_acquire!(pool) + + # acquire! uses reshape(1D_view, dims) → 0 alloc regardless of pattern count + allocs = @allocated test_nd_5way_acquire!(pool) + allocs > 0 && @warn "N-D acquire! 
5-way: $allocs bytes (expected 0)" + @test allocs == 0 + end + + @testset "N-D unsafe_acquire!: 5-way behavior" begin + pool = AdaptiveArrayPool() + + function test_nd_5way_unsafe!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16), (4, 12)) + for _ in 1:100 + for dims in dims_list + @with_pool p begin + unsafe_acquire!(p, Float64, dims...) + end + end + end + end + + # Warmup + test_nd_5way_unsafe!(pool) + test_nd_5way_unsafe!(pool) + + allocs = @allocated test_nd_5way_unsafe!(pool) + # N-way eviction: 5 patterns > CACHE_WAYS=4 + @test allocs > 0 + end + + @testset "Cache invalidation on resize" begin + pool = AdaptiveArrayPool() + + # Warmup with small array + @with_pool pool begin + unsafe_acquire!(pool, Float64, 10, 10) + end + + # Request larger array (forces resize, invalidates cache) + @with_pool pool begin + arr = unsafe_acquire!(pool, Float64, 100, 100) + @test size(arr) == (100, 100) + end + + # Warmup again with new size + @with_pool pool begin + unsafe_acquire!(pool, Float64, 100, 100) + end + + # Now should be zero allocation + allocs = @allocated begin + @with_pool pool begin + unsafe_acquire!(pool, Float64, 100, 100) + end + end + @test allocs == 0 + end + + @testset "Multiple slots with N-way cache" begin + pool = AdaptiveArrayPool() + + # Warmup: each slot gets 2 different shapes + for _ in 1:2 + @with_pool pool begin + unsafe_acquire!(pool, Float64, 5, 5) # Slot 1 + unsafe_acquire!(pool, Float64, 10, 10) # Slot 2 + end + @with_pool pool begin + unsafe_acquire!(pool, Float64, 6, 6) # Slot 1, different dims + unsafe_acquire!(pool, Float64, 12, 12) # Slot 2, different dims + end + end + + # Both slots should have their shapes cached + allocs = @allocated begin + @with_pool pool begin + unsafe_acquire!(pool, Float64, 5, 5) + unsafe_acquire!(pool, Float64, 10, 10) + end + end + @test allocs == 0 + + # Alternating dims should also hit cache + allocs = @allocated begin + @with_pool pool begin + unsafe_acquire!(pool, Float64, 6, 6) + 
unsafe_acquire!(pool, Float64, 12, 12) + end + end + @test allocs == 0 + end + +end diff --git a/test/runtests.jl b/test/runtests.jl index f8cc5f2e..a0d0459c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,14 @@ using Test using AdaptiveArrayPools using AdaptiveArrayPools: get_typed_pool! -import AdaptiveArrayPools: checkpoint!, rewind! +import AdaptiveArrayPools: checkpoint!, rewind! + +# Version-specific helpers (always defined, even for ARGS path) +@static if VERSION >= v"1.11-" + _test_nd_cache_preserved(tp) = !isempty(tp.nd_wrappers) +else + _test_nd_cache_preserved(tp) = length(tp.nd_arrays) >= 1 +end # Check if specific test files are requested via ARGS if !isempty(ARGS) @@ -10,26 +17,50 @@ if !isempty(ARGS) include(testfile) end else - include("test_aqua.jl") - include("test_basic.jl") - include("test_state.jl") - include("test_multidimensional.jl") - include("test_macros.jl") - include("test_task_local_pool.jl") - include("test_utils.jl") - include("test_macro_expansion.jl") - include("test_macro_internals.jl") - include("test_zero_allocation.jl") - include("test_disabled_pooling.jl") - include("test_aliases.jl") - include("test_nway_cache.jl") - include("test_fixed_slots.jl") - include("test_backend_macro_expansion.jl") - include("test_convenience.jl") - include("test_bitarray.jl") - include("test_coverage.jl") - include("test_allocation.jl") - include("test_fallback_reclamation.jl") + # Version-specific test file selection + @static if VERSION >= v"1.11-" + include("test_aqua.jl") + include("test_basic.jl") + include("test_state.jl") + include("test_multidimensional.jl") + include("test_macros.jl") + include("test_task_local_pool.jl") + include("test_utils.jl") + include("test_macro_expansion.jl") + include("test_macro_internals.jl") + include("test_zero_allocation.jl") + include("test_disabled_pooling.jl") + include("test_aliases.jl") + include("test_nway_cache.jl") + include("test_fixed_slots.jl") + 
include("test_backend_macro_expansion.jl") + include("test_convenience.jl") + include("test_bitarray.jl") + include("test_coverage.jl") + include("test_allocation.jl") + include("test_fallback_reclamation.jl") + else + include("test_aqua.jl") + include("test_basic.jl") + include("test_state.jl") + include("test_multidimensional.jl") + include("test_macros.jl") + include("test_task_local_pool.jl") + include("test_utils.jl") + include("test_macro_expansion.jl") + include("test_macro_internals.jl") + include("test_zero_allocation.jl") + include("test_disabled_pooling.jl") + include("test_aliases.jl") + include("legacy/test_nway_cache.jl") + include("test_fixed_slots.jl") + include("test_backend_macro_expansion.jl") + include("test_convenience.jl") + include("test_bitarray.jl") + include("test_coverage.jl") + include("test_allocation.jl") + include("test_fallback_reclamation.jl") + end # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" diff --git a/test/test_nway_cache.jl b/test/test_nway_cache.jl index c5051e74..cd1c2c76 100644 --- a/test/test_nway_cache.jl +++ b/test/test_nway_cache.jl @@ -101,8 +101,7 @@ end @test allocs == 0 end - @testset "N-D unsafe_acquire!: 5-way causes allocation (cache eviction)" begin - # unsafe_acquire! uses N-way cache → 5-way exceeds CACHE_WAYS=4 + @testset "N-D unsafe_acquire!: 5-way behavior" begin pool = AdaptiveArrayPool() function test_nd_5way_unsafe!(p) @@ -110,19 +109,45 @@ end for _ in 1:100 for dims in dims_list @with_pool p begin - unsafe_acquire!(p, Float64, dims...) # Array with cache + unsafe_acquire!(p, Float64, dims...) end end end end - # Warmup (fills cache with 4 patterns, 5th evicts one) + # Warmup test_nd_5way_unsafe!(pool) test_nd_5way_unsafe!(pool) - # 5-way exceeds 4-way cache → eviction → unsafe_wrap allocation allocs = @allocated test_nd_5way_unsafe!(pool) - @test allocs > 0 + # setfield! 
reuse: unlimited dim patterns, 0-alloc + allocs > 0 && @warn "N-D 5-way unsafe: $allocs bytes (expected 0)" + @test allocs == 0 + end + + @testset "N-D unsafe_acquire!: 10+ patterns per slot is zero-alloc" begin + # Demonstrates removal of CACHE_WAYS limit via setfield! (Julia 1.11+) + pool = AdaptiveArrayPool() + + function test_nd_many_patterns!(p) + dims_list = ((2, 50), (5, 20), (10, 10), (20, 5), (50, 2), + (1, 100), (100, 1), (4, 25), (25, 4), (8, 13)) + for _ in 1:50 + for dims in dims_list + @with_pool p begin + unsafe_acquire!(p, Float64, dims...) + end + end + end + end + + # Warmup + test_nd_many_patterns!(pool) + test_nd_many_patterns!(pool) + + allocs = @allocated test_nd_many_patterns!(pool) + allocs > 0 && @warn "N-D 10+ patterns: $allocs bytes (expected 0)" + @test allocs == 0 end @testset "Cache invalidation on resize" begin @@ -188,3 +213,179 @@ end end end + +# ============================================================================== +# Vector-Based N-D Wrapper Cache Tests (Julia 1.11+) +# ============================================================================== +# These tests verify the Dict→Vector migration for nd_wrappers. + +@testset "Vector-based nd_wrappers cache" begin + using AdaptiveArrayPools: checkpoint!, rewind! 
+ + @testset "nd_wrappers grows correctly for multiple dimensionalities" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # N=1: 1D unsafe_acquire + v1 = unsafe_acquire!(pool, Float64, 10) + @test length(pool.float64.nd_wrappers) >= 1 + + # N=2: 2D unsafe_acquire — nd_wrappers should grow to index 2 + m1 = unsafe_acquire!(pool, Float64, 3, 4) + @test length(pool.float64.nd_wrappers) >= 2 + @test pool.float64.nd_wrappers[2] !== nothing # has a Vector{Any} for N=2 + + # N=3: 3D unsafe_acquire — nd_wrappers should grow to index 3 + t1 = unsafe_acquire!(pool, Float64, 2, 3, 4) + @test length(pool.float64.nd_wrappers) >= 3 + @test pool.float64.nd_wrappers[3] !== nothing # has a Vector{Any} for N=3 + + rewind!(pool) + end + + @testset "wrapper object identity is preserved on cache hit" begin + pool = AdaptiveArrayPool() + + # First call: cache miss → creates wrapper + checkpoint!(pool) + m1 = unsafe_acquire!(pool, Float64, 3, 4) + wrapper_id = objectid(m1) + rewind!(pool) + + # Second call: cache hit → same wrapper object, updated fields + checkpoint!(pool) + m2 = unsafe_acquire!(pool, Float64, 5, 6) + @test objectid(m2) == wrapper_id # same Array object reused via setfield! 
+ @test size(m2) == (5, 6) # dims updated in-place + rewind!(pool) + + # Third call with same dims: still same wrapper + checkpoint!(pool) + m3 = unsafe_acquire!(pool, Float64, 5, 6) + @test objectid(m3) == wrapper_id + @test size(m3) == (5, 6) + rewind!(pool) + end + + @testset "different N values use independent wrapper slots" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Slot 1 as 2D + m = unsafe_acquire!(pool, Float64, 3, 4) + rewind!(pool) + + checkpoint!(pool) + # Slot 1 as 3D — different N, must create new wrapper + t = unsafe_acquire!(pool, Float64, 2, 3, 4) + @test size(t) == (2, 3, 4) + + # Both N=2 and N=3 entries exist + @test pool.float64.nd_wrappers[2] !== nothing + @test pool.float64.nd_wrappers[3] !== nothing + rewind!(pool) + end + + @testset "nd_wrappers with nothing gaps for skipped N" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Jump directly to N=3 without using N=2 + t = unsafe_acquire!(pool, Float64, 2, 3, 4) + @test length(pool.float64.nd_wrappers) >= 3 + + # N=2 entry should be nothing (never used for N=2) + @test pool.float64.nd_wrappers[2] === nothing + + rewind!(pool) + end + + @testset "BitTypedPool nd_wrappers cache" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # 1D BitArray + bv = acquire!(pool, Bit, 100) + @test length(pool.bits.nd_wrappers) >= 1 + + # 2D BitArray + ba = acquire!(pool, Bit, 10, 10) + @test length(pool.bits.nd_wrappers) >= 2 + @test pool.bits.nd_wrappers[2] !== nothing + + rewind!(pool) + + # Verify wrapper reuse for BitArray + checkpoint!(pool) + bv2 = acquire!(pool, Bit, 50) + ba2 = acquire!(pool, Bit, 5, 20) + @test size(ba2) == (5, 20) + rewind!(pool) + end + + @testset "empty! 
clears nd_wrappers" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + unsafe_acquire!(pool, Float64, 3, 4) + rewind!(pool) + + @test !isempty(pool.float64.nd_wrappers) + empty!(pool) + @test isempty(pool.float64.nd_wrappers) + end + + @testset "multiple element types have independent nd_wrappers" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + mf = unsafe_acquire!(pool, Float64, 3, 4) + mi = unsafe_acquire!(pool, Int64, 5, 6) + + @test pool.float64.nd_wrappers[2] !== nothing + @test pool.int64.nd_wrappers[2] !== nothing + + # They must be separate Vector{Any} instances + @test pool.float64.nd_wrappers[2] !== pool.int64.nd_wrappers[2] + + rewind!(pool) + end + + # Function barrier for accurate allocation measurement + function test_mixed_nd_zero_alloc() + pool = AdaptiveArrayPool() + + # Warmup: exercise N=1, N=2, N=3 for same slot + for _ in 1:2 + @with_pool pool begin + unsafe_acquire!(pool, Float64, 100) + end + @with_pool pool begin + unsafe_acquire!(pool, Float64, 10, 10) + end + @with_pool pool begin + unsafe_acquire!(pool, Float64, 5, 4, 5) + end + end + + # Measure: all three should be cache hits + a1 = @allocated @with_pool pool begin + unsafe_acquire!(pool, Float64, 50) + end + a2 = @allocated @with_pool pool begin + unsafe_acquire!(pool, Float64, 7, 7) + end + a3 = @allocated @with_pool pool begin + unsafe_acquire!(pool, Float64, 3, 3, 3) + end + return (a1, a2, a3) + end + + @testset "mixed dimensionalities zero-alloc after warmup" begin + test_mixed_nd_zero_alloc() + test_mixed_nd_zero_alloc() + a1, a2, a3 = test_mixed_nd_zero_alloc() + @test a1 == 0 + @test a2 == 0 + @test a3 == 0 + end +end diff --git a/test/test_state.jl b/test/test_state.jl index a0f67b31..4254c87d 100644 --- a/test/test_state.jl +++ b/test/test_state.jl @@ -287,7 +287,7 @@ import AdaptiveArrayPools: _typed_lazy_checkpoint!, _typed_lazy_rewind!, _tracke @test pool.float64.n_active == 0 @test length(pool.float64.vectors) >= 3 # Vectors preserved @test 
length(pool.float64.views) >= 1 # 1D cache preserved - @test length(pool.float64.nd_arrays) >= 1 # N-D cache preserved + @test _test_nd_cache_preserved(pool.float64) # N-D cache preserved end @testset "reset! restores checkpoint stacks to sentinel" begin