diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
index 8abb3672..03c2ecf7 100644
--- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
+++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
@@ -10,19 +10,6 @@ module AdaptiveArrayPoolsCUDAExt
 
 using AdaptiveArrayPools
 using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool
-using Preferences: @load_preference, @set_preferences!
-
-# N-way view cache configuration (CUDA only — CPU ≥1.11 uses slot-first _claim_slot!).
-# GPU view/reshape allocates ~80 bytes on CPU heap, so caching still matters.
-const CACHE_WAYS = let
-    ways = @load_preference("cache_ways", 4)::Int
-    if ways < 1 || ways > 16
-        @warn "CACHE_WAYS=$ways out of range [1,16], using default 4"
-        4
-    else
-        ways
-    end
-end
 using CUDA
 
 # Type definitions
@@ -31,7 +18,7 @@ include("types.jl")
 # Dispatch methods (allocate_vector, wrap_array, get_typed_pool!)
 include("dispatch.jl")
 
-# GPU-specific get_view! implementation
+# GPU-specific acquire (arr_wrappers + setfield!, _resize_to_fit!, _reshape_impl!)
 include("acquire.jl")
 
 # Task-local pool (multi-device aware)
diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl
index 21a8075a..a0a77fb1 100644
--- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl
+++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl
@@ -1,192 +1,315 @@
 # ==============================================================================
-# CUDA-Specific Unified get_view! Implementation (N-Way Cache)
+# CUDA-Specific Acquire Implementation (arr_wrappers + setfield!)
 # ==============================================================================
-# Unlike CPU, GPU views (view(CuVector, 1:n)) return CuVector via GPUArrays derive(),
-# NOT SubArray. Similarly, reshape() returns CuArray, not ReshapedArray.
-# This allows a single unified implementation for all dimensions.
+# Mirrors CPU's Julia 1.11+ approach: cached CuArray{T,N} wrappers reused via
+# setfield!(:dims) for zero-allocation on cache hit.
 #
-# N-way cache layout (flat vector):
-#   views[(slot-1)*CACHE_WAYS + way] for way ∈ 1:CACHE_WAYS
-#
-# Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable).
+# Key differences from CPU:
+# - CPU Array has :ref (MemoryRef, GC-managed, no refcount)
+# - CuArray has :data (DataRef, manual refcount via Threads.Atomic{Int})
+# - We minimize refcount overhead via wrapper.data.rc !== vec.data.rc identity
+#   check (~2ns). Only update :data when GPU buffer actually changed (rare).
 #
 # ==============================================================================
-# Memory Resize Strategy: _resize_without_shrink!
+# Memory Resize Strategy: _resize_to_fit!
 # ==============================================================================
-# GPU vs CPU difference:
-#   - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged, cheap)
-#   - GPU CuVector: resize!(v, smaller) may reallocate when n < capacity÷4
-#     (CUDA.jl's 25% threshold triggers pool_alloc + copy + pool_free)
-#
-# Problem: Pool operations frequently shrink backing vectors:
-#   - Safety invalidation: resize!(vec, 0) to invalidate released slots
-#   - Acquire path: resize!(vec, smaller_size) when reusing a slot for smaller array
-#   Both trigger expensive GPU reallocation via CUDA.jl's 25% threshold.
+# CUDA.jl resize! behavior varies by version:
+#   - v5.9.x: ALWAYS reallocates (no capacity management)
+#   - v5.10.x+: capacity check — reallocates only when n > cap or n < cap÷4
 #
-# Solution: _resize_without_shrink!(A, n)
-#   - Grow (n > length): delegates to CUDA.jl resize! (may allocate more GPU memory)
-#   - Shrink (n < length): setfield!(A, :dims, (n,)) — logical size only, no GPU op
-#   - Equal (n == length): no-op
+# _resize_to_fit!(A, n):
+#   - n within capacity (maxsize): setfield!(:dims) only — no GPU operation
+#   - n beyond capacity: delegates to CUDA.jl resize! (GPU alloc)
+#   - n == length(A): no-op
 #
-# Key property: maxsize is preserved on shrink. When later growing back,
-# CUDA.jl computes cap = maxsize ÷ aligned_sizeof(T) and sees n ≤ cap,
-# so no reallocation occurs. This is ideal for pool's borrow/return pattern.
+# This is strictly better than _resize_without_shrink! (which only optimized
+# shrink). _resize_to_fit! also optimizes grow-within-capacity, critical for
+# re-acquire after safety invalidation (dims=(0,), maxsize preserved).
 #
-# ⚠ Depends on CuArray internal fields (:dims, .maxsize). Tested with CUDA.jl v5.x.
+# ⚠ Depends on CuArray internal fields (:data, :dims, :maxsize, :offset).
+#   Tested with CUDA.jl v5.x.
 # ==============================================================================
 
 using AdaptiveArrayPools: get_view!, get_array!, allocate_vector, safe_prod,
     _record_type_touch!, _fixed_slot_bit, _checkpoint_typed_pool!,
+    _store_arr_wrapper!, _check_pool_growth, _reshape_impl!,
+    _acquire_impl!, _unsafe_acquire_impl!, _maybe_record_borrow!,
     _MODE_BITS_MASK
 
+using CUDA.GPUArrays: unsafe_free!
+
 # Guard against CUDA.jl internal API changes (tested with v5.x).
-# setfield!(:dims) requires CuArray to be mutable and have a :dims field.
-@static if !(ismutabletype(CuArray) && hasfield(CuArray, :dims))
-    error("Unsupported CUDA.jl version: expected mutable CuArray with field :dims. _resize_without_shrink! needs updating.")
+# Every field listed here is read or written via getfield/setfield! in this file.
+@static if !(
+        ismutabletype(CuArray) &&
+            all(name -> hasfield(CuArray, name), (:data, :dims, :maxsize, :offset))
+    )
+    error("Unsupported CUDA.jl version: CuArray must be mutable with :data, :dims, :maxsize, :offset fields.")
 end
 
+# ==============================================================================
+# Aligned sizeof (mirrors CUDA.jl internal)
+# ==============================================================================
+
+"""Return the per-element byte stride used for capacity math (analogue of CUDA.jl's internal `aligned_sizeof`)."""
+_aligned_sizeof(::Type{T}) where {T} = max(Base.datatype_alignment(T), sizeof(T))
+
+# ==============================================================================
+# _resize_to_fit! — Capacity-Aware Resize (superset of _resize_without_shrink!)
+# ==============================================================================
+
 """
-    _resize_without_shrink!(A::CuVector{T}, n::Integer) -> CuVector{T}
+    _resize_to_fit!(A::CuVector{T}, n::Integer) -> CuVector{T}
 
-Resize a CuVector's logical length without freeing GPU memory on shrink.
+Resize a CuVector's logical length, using `setfield!(:dims)` when within capacity.
 
-- `n > length(A)`: delegates to `resize!(A, n)` (may grow GPU allocation)
+- `n > capacity`: delegates to `resize!(A, n)` (may grow GPU allocation)
+- `n ≤ capacity, n ≠ length(A)`: `setfield!(:dims)` only — no GPU operation
 - `n == length(A)`: no-op
-- `n < length(A)`: only updates `dims` field (GPU memory preserved at `maxsize`)
-
-Avoids CUDA.jl's 25% threshold reallocation on shrink (`n < cap÷4` triggers
-`pool_alloc` + `unsafe_copyto!` + `pool_free`), which is expensive for pool
-operations like safety invalidation (`resize!(v, 0)`) and acquire-path resizing.
-"""
-@inline function _resize_without_shrink!(A::CuVector{T}, n::Integer) where {T}
-    current = length(A)
-    if n > current
-        resize!(A, n)                       # grow: delegate to CUDA.jl
-    elseif n < current
-        setfield!(A, :dims, (Int(n),))      # shrink: dims only, GPU memory preserved
+
+Capacity = `A.maxsize ÷ aligned_sizeof(T)`. Since `setfield!(:dims)` preserves
+`maxsize`, capacity information is naturally retained across shrink/grow cycles.
+"""
+@inline function _resize_to_fit!(A::CuVector{T}, n::Integer) where {T}
+    capacity = div(A.maxsize, _aligned_sizeof(T))   # elements the current buffer can hold
+    if n <= capacity
+        n == length(A) || setfield!(A, :dims, (Int(n),))   # fits: dims only (no-op if equal)
+    else
+        resize!(A, n)                                       # exceeds capacity: CUDA.jl resize!
     end
     return A
 end
 
+# ==============================================================================
+# _cuda_claim_slot! — Capacity-Based Slot Claim
+# ==============================================================================
+
 """
-    get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T}
+    _cuda_claim_slot!(tp::CuTypedPool{T}, total_len::Int) -> Int
 
-1D convenience wrapper - delegates to tuple version.
-`(n,)` is stack-allocated (isbits NTuple), so this is zero-allocation when inlined.
+Claim the next slot, ensuring the backing vector's GPU buffer has capacity ≥ `total_len`.
+Uses maxsize-based capacity check instead of length check to avoid triggering
+CUDA.jl's resize! unnecessarily (especially after safety invalidation sets dims=(0,)).
 """
-@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T}
-    return get_view!(tp, (n,))
+@inline function _cuda_claim_slot!(tp::CuTypedPool{T}, total_len::Int) where {T}
+    slot = (tp.n_active += 1)
+    if slot <= length(tp.vectors)
+        # Reused slot: bring the backing vector to exactly `total_len` elements.
+        #   - beyond capacity  → resize! (GPU allocation)
+        #   - length differs   → dims rewrite only (also undoes the dims=(0,) invalidation)
+        #   - length matches   → nothing to do (hot path)
+        _resize_to_fit!(@inbounds(tp.vectors[slot]), total_len)
+    else
+        # Brand-new slot: allocate its backing vector, then run the pool-growth check.
+        push!(tp.vectors, allocate_vector(tp, total_len))
+        _check_pool_growth(tp, slot)
+    end
+    return slot
 end
 
 """
-    get_view!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N}
+    _cuda_claim_slot!(tp::CuTypedPool{T}) -> Int
 
-Get an N-dimensional view from the pool with unified N-way caching.
-Returns cached view on hit (near-zero CPU allocation), creates new on miss.
+Claim the next slot without provisioning memory (zero-length backing vector).
+Used by `_reshape_impl!` which only needs the slot index for wrapper caching —
+the wrapper points to a different array's memory via `setfield!(:data)`.
+"""
+@inline function _cuda_claim_slot!(tp::CuTypedPool{T}) where {T}
+    slot = (tp.n_active += 1)
+    # An existing slot is handed back untouched; its backing vector is not resized here.
+    slot <= length(tp.vectors) && return slot
+    # New slot: push a zero-length placeholder so `vectors` stays aligned with slot numbers.
+    push!(tp.vectors, CuVector{T}(undef, 0))
+    _check_pool_growth(tp, slot)
+    return slot
+end
 
-## N-Way Cache Behavior
-- Each slot has CACHE_WAYS (4) cache entries for different dimension patterns
-- Cache lookup uses simple for loop (~16 bytes overhead)
-- Cache replacement uses round-robin when all ways are occupied
+# ==============================================================================
+# _update_cuda_wrapper_data! — DataRef Refcount Management
+# ==============================================================================
 
-## GPU-Specific Behavior
-- GPU `view()` returns `CuVector` (not SubArray)
-- GPU `reshape()` returns `CuArray{T,N}` (not ReshapedArray)
-- Both allocate ~80 bytes on CPU heap for the wrapper object
-- N-way caching eliminates this allocation on cache hit
+"""
+    _update_cuda_wrapper_data!(cu::CuArray, source::CuArray)
 
-## Memory Resize Strategy
-Backing vectors use `_resize_without_shrink!`: grow delegates to CUDA.jl's
-`resize!` (may reallocate), shrink only updates `dims` (GPU memory preserved).
-See module header for details.
+Update wrapper's GPU data reference when the source's buffer has changed.
+Decrements old refcount, increments new. @noinline: rare path (only on grow
+beyond capacity), keep off the hot inlined acquire path.
 """
-@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N}
-    tp.n_active += 1
-    idx = tp.n_active
-    total_len = safe_prod(dims)
+@noinline function _update_cuda_wrapper_data!(cu::CuArray, source::CuArray)
+    unsafe_free!(cu.data)                      # release the wrapper's reference to its old buffer
+    setfield!(cu, :data, copy(source.data))    # copy(::DataRef) takes a fresh reference on source's buffer
+    setfield!(cu, :maxsize, source.maxsize)    # capacity in bytes follows the new buffer
+    setfield!(cu, :offset, source.offset)      # was hard-coded 0: wrong when `source` is an offset view (reshape path)
+    return nothing
+end
 
-    # 1. Expand pool if needed (new slot)
-    if idx > length(tp.vectors)
-        push!(tp.vectors, allocate_vector(tp, total_len))
-        @inbounds vec = tp.vectors[idx]
-        new_view = view(vec, 1:total_len)
-        nd_view = N == 1 ? new_view : reshape(new_view, dims)
-
-        # Initialize N-way cache entries for this slot
-        for _ in 1:CACHE_WAYS
-            push!(tp.views, nothing)
-            push!(tp.view_dims, nothing)
-        end
-        push!(tp.next_way, 1)
+# ==============================================================================
+# _acquire_impl! / _unsafe_acquire_impl! — Direct get_array! Dispatch
+# ==============================================================================
+# On CUDA, both acquire! and unsafe_acquire! go through get_array! directly.
+# No view/array distinction — CuArray is always returned.
+# This eliminates the get_view! → get_array! indirection that CPU still uses
+# for the acquire! (view) path.
 
-        # Store in first way
-        base = (idx - 1) * CACHE_WAYS
-        @inbounds tp.views[base + 1] = nd_view
-        @inbounds tp.view_dims[base + 1] = dims
+"""
+    _acquire_impl!(pool::CuAdaptiveArrayPool, T, n) -> CuArray{T,1}
+    _acquire_impl!(pool::CuAdaptiveArrayPool, T, dims...) -> CuArray{T,N}
 
-        # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!()
-        if idx >= 512 && (idx & (idx - 1)) == 0
-            total_bytes = sum(length, tp.vectors) * sizeof(T)
-            @warn "CuTypedPool{$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?"
-        end
+CUDA override: routes directly to `get_array!` (no view indirection).
+"""
+@inline function AdaptiveArrayPools._acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, n::Int) where {T}
+    typed_pool = get_typed_pool!(pool, T)      # sub-pool for element type T
+    arr = get_array!(typed_pool, (n,))         # 1-D request passed as a 1-tuple of dims
+    _maybe_record_borrow!(pool, typed_pool)    # borrow bookkeeping hook, called after the claim
+    return arr
+end
 
-        return nd_view
-    end
+@inline function AdaptiveArrayPools._acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N}
+    typed_pool = get_typed_pool!(pool, T)      # sub-pool for element type T
+    arr = get_array!(typed_pool, dims)         # Vararg dims arrive as an NTuple{N,Int}
+    _maybe_record_borrow!(pool, typed_pool)    # borrow bookkeeping hook, called after the claim
+    return arr
+end
 
-    # 2. N-way cache lookup with for loop
-    base = (idx - 1) * CACHE_WAYS
-    for k in 1:CACHE_WAYS
-        cache_idx = base + k
-        @inbounds cached_dims = tp.view_dims[cache_idx]
-        if cached_dims isa NTuple{N, Int} && cached_dims == dims
-            # Cache hit - return cached view
-            return @inbounds tp.views[cache_idx]::CuArray{T, N}
-        end
-    end
+@inline function AdaptiveArrayPools._acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N}
+    return AdaptiveArrayPools._acquire_impl!(pool, T, dims...)  # tuple form: splat into the positional-dims methods
+end
 
-    # 3. Cache miss: create new view, use round-robin replacement
-    @inbounds vec = tp.vectors[idx]
-    current_len = length(vec)
-    if current_len != total_len
-        # Resize vector to match requested size (grow or shrink).
-        # Uses _resize_without_shrink! to avoid GPU reallocation on shrink.
-        _resize_without_shrink!(vec, total_len)
-        # CRITICAL: on grow, _resize_without_shrink! delegates to resize! which
-        # may reallocate the GPU buffer (pointer change). On shrink, pointer is
-        # stable but length changed. Either way, cached views are stale.
-        # Must invalidate ALL ways to prevent returning stale/dangling views.
-        for k in 1:CACHE_WAYS
-            @inbounds tp.views[base + k] = nothing
-            @inbounds tp.view_dims[base + k] = nothing
-        end
-        @inbounds tp.next_way[idx] = 1  # Reset round-robin
-    end
+"""
+    _unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, T, dims...) -> CuArray{T,N}
 
-    new_view = view(vec, 1:total_len)
-    nd_view = N == 1 ? new_view : reshape(new_view, dims)
+CUDA override: same as `_acquire_impl!` — both return CuArray via `get_array!`.
+"""
+@inline function AdaptiveArrayPools._unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, n::Int) where {T}
+    typed_pool = get_typed_pool!(pool, T)      # sub-pool for element type T
+    arr = get_array!(typed_pool, (n,))         # 1-D request passed as a 1-tuple of dims
+    _maybe_record_borrow!(pool, typed_pool)    # borrow bookkeeping hook, called after the claim
+    return arr
+end
 
-    # Round-robin replacement (or first way if just flushed)
-    @inbounds way = tp.next_way[idx]
-    cache_idx = base + way
-    @inbounds tp.views[cache_idx] = nd_view
-    @inbounds tp.view_dims[cache_idx] = dims
-    @inbounds tp.next_way[idx] = (way % CACHE_WAYS) + 1
+@inline function AdaptiveArrayPools._unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N}
+    typed_pool = get_typed_pool!(pool, T)      # sub-pool for element type T
+    arr = get_array!(typed_pool, dims)         # Vararg dims arrive as an NTuple{N,Int}
+    _maybe_record_borrow!(pool, typed_pool)    # borrow bookkeeping hook, called after the claim
+    return arr
+end
 
-    return nd_view
+@inline function AdaptiveArrayPools._unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N}
+    return AdaptiveArrayPools._unsafe_acquire_impl!(pool, T, dims...)  # tuple form: splat into the positional-dims methods
 end
 
 # ==============================================================================
-# CUDA-Specific get_array! - Delegates to unified get_view!
+# get_view! / get_array! — arr_wrappers + setfield! Based Zero-Alloc
 # ==============================================================================
+# get_view! delegates to get_array! for backward compat (e.g., direct get_view! calls).
+# The main acquire path now bypasses get_view! entirely via _acquire_impl! above.
+
+# Compat shims for direct get_view! callers: both forms hand the request to get_array!.
+# 1-D form wraps the length in a 1-tuple.
+@inline AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} = get_array!(tp, (n,))
+
+# N-D form forwards the dims tuple unchanged.
+@inline AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} =
+    get_array!(tp, dims)
 
 """
     get_array!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N}
 
-Delegates to `get_view!(tp, dims)` for unified caching.
-Used by `unsafe_acquire!` - same zero-allocation behavior as `acquire!`.
+Get an N-dimensional `CuArray` from the pool with `setfield!`-based wrapper reuse.
+
+## Cache Hit (common case, 0-alloc)
+1. Look up `arr_wrappers[N][slot]`
+2. Check `wrapper.data.rc !== vec.data.rc` — if same GPU buffer, just `setfield!(:dims)`
+3. If different (rare: only after grow beyond capacity), update `:data` via refcount management
+
+## Cache Miss (first call per (slot, N))
+Creates CuArray wrapper sharing backing vector's GPU memory via `copy(vec.data)`,
+stores in `arr_wrappers[N][slot]` via `_store_arr_wrapper!` (reuses base module helper).
 """
 @inline function AdaptiveArrayPools.get_array!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N}
-    return get_view!(tp, dims)
+    total_len = safe_prod(dims)
+    slot = _cuda_claim_slot!(tp, total_len)
+    @inbounds vec = tp.vectors[slot]
+
+    # arr_wrappers lookup (direct index, no hash). The table is 1-indexed by N, so the
+    # lower bound keeps a 0-D request (N == 0) away from the @inbounds read at index 0.
+    wrappers = 1 <= N <= length(tp.arr_wrappers) ? (@inbounds tp.arr_wrappers[N]) : nothing
+    if wrappers !== nothing && slot <= length(wrappers)
+        wrapper = @inbounds wrappers[slot]
+        if wrapper !== nothing
+            cu = wrapper::CuArray{T, N}
+            # Re-point :data only if vec's GPU buffer changed (rare: grow beyond capacity)
+            cu.data.rc !== vec.data.rc && _update_cuda_wrapper_data!(cu, vec)
+            setfield!(cu, :dims, dims)
+            return cu
+        end
+    end
+
+    # Cache miss: create wrapper sharing vec's GPU memory
+    cu = CuArray{T, N}(copy(vec.data), dims; maxsize = vec.maxsize, offset = 0)
+    # 0-D wrappers have no arr_wrappers entry to live in, so they are returned uncached.
+    N == 0 || _store_arr_wrapper!(tp, N, slot, cu)
+    return cu
 end
+
+# ==============================================================================
+# _reshape_impl! for CuArray — Zero-Alloc Reshape
+# ==============================================================================
+
+"""
+    _reshape_impl!(pool::CuAdaptiveArrayPool, A::CuArray{T,M}, dims::NTuple{N,Int}) -> CuArray{T,N}
+
+Zero-allocation reshape for CuArray using `setfield!`-based wrapper reuse. Throws `ArgumentError`
+for a negative dimension and `DimensionMismatch` when the element count differs from `length(A)`.
+
+- **M == N**: `setfield!(A, :dims, dims)` — mutates and returns `A` itself, no pool interaction
+- **M ≠ N**: claims a pool slot; the cached `CuArray{T,N}` wrapper is re-synced to `A` (`:data`, `:maxsize`, `:offset`, `:dims`)
+"""
+@inline function AdaptiveArrayPools._reshape_impl!(
+        pool::CuAdaptiveArrayPool, A::CuArray{T, M}, dims::NTuple{N, Int}
+    ) where {T, M, N}
+    for d in dims
+        d < 0 && throw(ArgumentError("invalid CuArray dimensions"))
+    end
+    total_len = safe_prod(dims)
+    length(A) == total_len || throw(
+        DimensionMismatch(
+            "new dimensions $(dims) must be consistent with array length $(length(A))"
+        )
+    )
+
+    # 0-D reshape: rare edge case, delegate to Base (arr_wrappers is 1-indexed by N)
+    N == 0 && return reshape(A, dims)
+
+    # Same dimensionality: just update dims in-place, no pool interaction
+    if M == N
+        setfield!(A, :dims, dims)
+        return A
+    end
+
+    # Different dimensionality: claim slot + reuse cached N-D wrapper
+    tp = AdaptiveArrayPools.get_typed_pool!(pool, T)
+    _record_type_touch!(pool, T)
+    slot = _cuda_claim_slot!(tp)
+
+    # Look up cached wrapper (direct index, no hash)
+    wrappers = N <= length(tp.arr_wrappers) ? (@inbounds tp.arr_wrappers[N]) : nothing
+    if wrappers !== nothing && slot <= length(wrappers)
+        wrapper = @inbounds wrappers[slot]
+        if wrapper !== nothing
+            cu = wrapper::CuArray{T, N}
+            cu.data.rc !== A.data.rc && _update_cuda_wrapper_data!(cu, A)  # buffer changed: swap DataRef
+            setfield!(cu, :maxsize, A.maxsize)  # always re-sync: a same-buffer source can still differ
+            setfield!(cu, :offset, A.offset)    # in offset/maxsize from whatever this slot cached last
+            setfield!(cu, :dims, dims)
+            return cu
+        end
+    end
+
+    # Cache miss (first call per slot+N): create wrapper, cache forever
+    cu = CuArray{T, N}(copy(A.data), dims; maxsize = A.maxsize, offset = A.offset)
+    _store_arr_wrapper!(tp, N, slot, cu)
+    return cu
 end
 
 # ==============================================================================
diff --git a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl
index 6dbb6a92..5c6c0405 100644
--- a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl
+++ b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl
@@ -6,18 +6,19 @@
 # Safety levels on CUDA differ from CPU:
 # - Level 0: Zero overhead (all branches dead-code-eliminated)
 # - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via
-#            _resize_without_shrink!(vec, 0) + N-way cache invalidation
+#            _resize_to_fit!(vec, 0) + arr_wrappers invalidation (setfield!(:dims, zeros))
 # - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays)
 # - Level 3: Full + borrow call-site registry + debug messages
 #
 # Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays.
 # On CUDA, resize!(CuVector, 0) would free GPU memory, so we use
-# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving
+# _resize_to_fit!(vec, 0) instead — sets dims to (0,) while preserving
 # the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink.
+# arr_wrappers are invalidated by setting wrapper dims to zeros (matches CPU pattern).
 
 using AdaptiveArrayPools: _safety_level, _validate_pool_return,
     _set_pending_callsite!, _maybe_record_borrow!,
-    _invalidate_released_slots!,
+    _invalidate_released_slots!, _zero_dims_tuple,
     _throw_pool_escape_error,
     POOL_DEBUG, POOL_SAFETY_LV,
     PoolRuntimeEscapeError
@@ -49,26 +50,29 @@ end
 #
 # Overrides the no-op fallback in base. On CUDA:
 # - Level 0: no-op (base _rewind_typed_pool! gates with S >= 1, so never called)
-# - Level 1+: poison released CuVectors + invalidate N-way view cache
-# - NO resize!(cuv, 0) — would free GPU memory
+# - Level 1+: poison released CuVectors + invalidate arr_wrappers
+# - NO resize!(cuv, 0) — would free GPU memory; use _resize_to_fit! instead
 
 @noinline function AdaptiveArrayPools._invalidate_released_slots!(
         tp::CuTypedPool{T}, old_n_active::Int, S::Int
     ) where {T}
     new_n = tp.n_active
+    # Poison released CuVectors + shrink logical length to 0
     for i in (new_n + 1):old_n_active
-        # Poison released CuVectors with sentinel values
         _cuda_poison_fill!(@inbounds tp.vectors[i])
-        # Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!).
+        # Shrink logical length to 0 (GPU memory preserved via _resize_to_fit!).
         # Matches CPU behavior where resize!(vec, 0) invalidates SubArray references.
-        _resize_without_shrink!(@inbounds(tp.vectors[i]), 0)
-        # Invalidate N-way cache entries for released slots.
-        # After poisoning, cached views point at poisoned data — clear them so
-        # re-acquire creates fresh views instead of returning stale poisoned ones.
-        base = (i - 1) * CACHE_WAYS
-        for k in 1:CACHE_WAYS
-            @inbounds tp.views[base + k] = nothing
-            @inbounds tp.view_dims[base + k] = nothing
+        _resize_to_fit!(@inbounds(tp.vectors[i]), 0)
+    end
+    # Invalidate arr_wrappers for released slots (matches CPU pattern from src/state.jl)
+    for N_idx in 1:length(tp.arr_wrappers)
+        wrappers_for_N = @inbounds tp.arr_wrappers[N_idx]
+        wrappers_for_N === nothing && continue
+        wrappers = wrappers_for_N::Vector{Any}
+        for i in (new_n + 1):min(old_n_active, length(wrappers))
+            wrapper = @inbounds wrappers[i]
+            wrapper === nothing && continue
+            setfield!(wrapper::CuArray, :dims, _zero_dims_tuple(N_idx))
         end
     end
     return nothing
diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl
index 80a28ee2..be5b5dae 100644
--- a/ext/AdaptiveArrayPoolsCUDAExt/state.jl
+++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl
@@ -316,9 +316,7 @@ CUDA.reclaim()
 """
 function Base.empty!(tp::CuTypedPool)
     empty!(tp.vectors)
-    empty!(tp.views)
-    empty!(tp.view_dims)
-    empty!(tp.next_way)
+    empty!(tp.arr_wrappers)
     tp.n_active = 0
     # Restore sentinel values
     empty!(tp._checkpoint_n_active)
diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl
index 90f40741..b4f03b6c 100644
--- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl
+++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl
@@ -3,44 +3,38 @@
 # ==============================================================================
 
 # Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()),
-# NOT SubArray. However, we still cache view objects to avoid CPU heap allocation
-# (~80 bytes per call) for the CuVector metadata wrapper.
-
-# Note: Uses shared CACHE_WAYS constant from main module for consistency.
+# NOT SubArray. GPU view/reshape creation allocates ~80 bytes on CPU heap for the
+# CuArray wrapper. We cache wrappers via arr_wrappers to achieve zero-allocation
+# on cache hit (same approach as CPU's setfield!-based Array wrapper reuse).
 
 """
     CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}}
 
-GPU memory pool for element type `T`. Uses unified N-way view caching for all dimensions.
+GPU memory pool for element type `T`. Uses `arr_wrappers`-based CuArray reuse
+for zero-allocation acquire (same design as CPU TypedPool on Julia 1.11+).
 
 ## Fields
 - `vectors`: Backing `CuVector{T}` storage (one per slot)
-- `views`: Flat N-way cache storing CuArray of any dimension
-  - Layout: `views[(slot-1)*CACHE_WAYS + way]` for way ∈ 1:CACHE_WAYS
-- `view_dims`: Cached dims corresponding to views
-- `next_way`: Round-robin counter per slot for cache replacement
+- `arr_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — indexed by N (dimensionality),
+  each entry is a per-slot cached `CuArray{T,N}` wrapper. Uses `setfield!(wrapper, :dims, dims)`
+  for zero-allocation reuse of unlimited dimension patterns within the same N.
+  When the backing vector's GPU buffer changes (rare: only on grow beyond capacity),
+  the wrapper's `:data` field is updated via DataRef refcount management.
 - State management fields (same as CPU)
 
 ## Design Note
-Unlike CPU where view() returns SubArray and reshape() returns ReshapedArray,
-CUDA returns CuArray for both operations. This allows a unified cache that
-stores CuArray{T,N} for any N, eliminating the need for separate 1D/N-D caches.
-
-GPU view/reshape creation allocates ~80 bytes on CPU heap for the CuArray
-wrapper object. N-way caching with for-loop lookup eliminates this allocation
-when the same dimensions pattern is requested again.
+Unlike CPU where `setfield!(:ref, MemoryRef)` is free (GC-managed),
+CuArray's `:data` field is `DataRef` with manual refcounting. We minimize this cost
+via `wrapper.data.rc !== vec.data.rc` identity check (~2ns): only update `:data`
+when the backing vector's GPU buffer actually changed. The common case (same buffer)
+is a simple `setfield!(:dims)` — truly zero-allocation.
 """
 mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}}
     # --- Storage ---
     vectors::Vector{CuVector{T}}
 
-    # --- Unified N-Way View Cache (flat layout) ---
-    # Length = n_slots * CACHE_WAYS
-    views::Vector{Any}       # CuArray{T,N} for any N
-    view_dims::Vector{Any}   # NTuple{N,Int} or nothing
-
-    # --- Cache Replacement (round-robin per slot) ---
-    next_way::Vector{Int}    # next_way[slot] ∈ 1:CACHE_WAYS
+    # --- N-D Wrapper Cache (setfield!-based reuse, matches CPU TypedPool) ---
+    arr_wrappers::Vector{Union{Nothing, Vector{Any}}}  # index=N (dimensionality), value=per-slot CuArray{T,N}
 
     # --- State Management (1-based sentinel pattern) ---
     n_active::Int
@@ -50,11 +44,9 @@ end
 
 function CuTypedPool{T}() where {T}
     return CuTypedPool{T}(
-        CuVector{T}[],      # vectors
-        Any[],              # views (N-way flat cache)
-        Any[],              # view_dims
-        Int[],              # next_way (round-robin counters)
-        0, [0], [0]         # State (1-based sentinel)
+        CuVector{T}[],                   # vectors: no backing slots allocated yet
+        Union{Nothing, Vector{Any}}[],   # arr_wrappers: empty; entry N holds the per-slot CuArray{T,N} wrappers
+        0, [0], [0]                      # State: n_active = 0, remaining fields start at the 1-based sentinel [0]
     )
 end
 
diff --git a/src/legacy/state.jl b/src/legacy/state.jl
index 724c53d2..7d6d1868 100644
--- a/src/legacy/state.jl
+++ b/src/legacy/state.jl
@@ -4,6 +4,17 @@
 # Identical to v1.11+ state.jl except empty! clears legacy N-way cache fields
 # (nd_arrays, nd_dims, nd_ptrs, nd_next_way) instead of nd_wrappers.
 
+# Build an all-zero dims tuple of length N for wrapper invalidation. The N ≤ 4 cases
+# are spelled out as literals to avoid the dynamic-dispatch allocation of
+# ntuple(_ -> 0, N) with a runtime N; larger N (extremely rare) falls back to ntuple.
+@inline function _zero_dims_tuple(N::Int)
+    return N == 1 ? (0,) :
+        N == 2 ? (0, 0) :
+        N == 3 ? (0, 0, 0) :
+        N == 4 ? (0, 0, 0, 0) :
+        ntuple(_ -> 0, N)
+end
+
 # ==============================================================================
 # State Management - checkpoint!
 # ==============================================================================
@@ -300,7 +311,7 @@ end
             dims === nothing && continue
             N = length(dims::Tuple)
             setfield!(ba::BitArray, :len, 0)
-            setfield!(ba::BitArray, :dims, ntuple(_ -> 0, N))
+            setfield!(ba::BitArray, :dims, _zero_dims_tuple(N))
         end
     end
     return nothing
diff --git a/src/state.jl b/src/state.jl
index 0d6331b6..9903c0b9 100644
--- a/src/state.jl
+++ b/src/state.jl
@@ -244,6 +244,17 @@ end
 _invalidate_released_slots!(::AbstractTypedPool, ::Int, ::Int) = nothing
 _invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing  # legacy 2-arg compat
 
+# All-zero dims tuple of length N, used for wrapper invalidation. Literal tuples for
+# 1 ≤ N ≤ 4 avoid the ntuple(_ -> 0, N) dynamic-dispatch allocation (runtime N →
+# heterogeneous return type → boxing). Any other N falls back to ntuple (extremely rare).
+@inline function _zero_dims_tuple(N::Int)
+    N == 1 && return (0,)
+    N == 2 && return (0, 0)
+    N == 3 && return (0, 0, 0)
+    N == 4 && return (0, 0, 0, 0)
+    return ntuple(_ -> 0, N)
+end
+
 @noinline function _invalidate_released_slots!(tp::TypedPool{T}, old_n_active::Int, S::Int = POOL_SAFETY_LV[]) where {T}
     new_n = tp.n_active
     # Level 2+: poison vectors with NaN/sentinel before structural invalidation
@@ -262,7 +273,7 @@ _invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing  # legacy 2-ar
         for i in (new_n + 1):min(old_n_active, length(wrappers))
             wrapper = @inbounds wrappers[i]
             wrapper === nothing && continue
-            setfield!(wrapper::Array, :size, ntuple(_ -> 0, N_idx))
+            setfield!(wrapper::Array, :size, _zero_dims_tuple(N_idx))
         end
     end
     return nothing
@@ -288,7 +299,7 @@ end
             wrapper === nothing && continue
             ba = wrapper::BitArray
             setfield!(ba, :len, 0)
-            setfield!(ba, :dims, ntuple(_ -> 0, N_idx))
+            setfield!(ba, :dims, _zero_dims_tuple(N_idx))
         end
     end
     return nothing
diff --git a/test/cuda/test_allocation.jl b/test/cuda/test_allocation.jl
index 7a5ff1b1..974b3f90 100644
--- a/test/cuda/test_allocation.jl
+++ b/test/cuda/test_allocation.jl
@@ -169,53 +169,68 @@
 
 end
 
-@testset "_resize_without_shrink! GPU memory preservation" begin
-    _resize_without_shrink! = ext._resize_without_shrink!
+@testset "_resize_to_fit! GPU memory preservation" begin
+    _resize_to_fit! = ext._resize_to_fit!
 
     @testset "Shrink preserves GPU pointer" begin
         v = CUDA.zeros(Float32, 1000)
         ptr = UInt(pointer(v))
-        _resize_without_shrink!(v, 100)
+        _resize_to_fit!(v, 100)
         @test length(v) == 100
         @test UInt(pointer(v)) == ptr
     end
 
-    @testset "Grow-back within maxsize: no realloc" begin
+    @testset "Grow-back within capacity: no realloc" begin
         v = CUDA.zeros(Float32, 1000)
         ptr = UInt(pointer(v))
         # Shrink first
-        _resize_without_shrink!(v, 100)
+        _resize_to_fit!(v, 100)
         @test length(v) == 100
         @test UInt(pointer(v)) == ptr
-        # Grow back to original size — maxsize preserved, so no GPU realloc
-        _resize_without_shrink!(v, 1000)
+        # Grow back to original size — within capacity, so no GPU realloc
+        _resize_to_fit!(v, 1000)
         @test length(v) == 1000
         @test UInt(pointer(v)) == ptr
     end
 
-    @testset "Shrink to 0 preserves pointer" begin
+    @testset "Shrink to 0, grow back preserves pointer" begin
         v = CUDA.zeros(Float32, 500)
         ptr = UInt(pointer(v))
-        _resize_without_shrink!(v, 0)
+        _resize_to_fit!(v, 0)
         @test length(v) == 0
         # GPU memory still allocated (not freed)
-        # Grow back from 0
-        _resize_without_shrink!(v, 500)
+        # Grow back from 0 — within capacity, so no GPU realloc
+        _resize_to_fit!(v, 500)
         @test length(v) == 500
         @test UInt(pointer(v)) == ptr
     end
 
+    @testset "Grow within capacity after invalidation: no realloc" begin
+        # This is the key test: after safety invalidation (dims→0),
+        # re-acquire within original capacity should NOT trigger GPU realloc.
+        # (CUDA.jl v5.9.x resize! would always reallocate; _resize_to_fit! avoids this)
+        v = CUDA.zeros(Float32, 1000)
+        ptr = UInt(pointer(v))
+        # Simulate safety invalidation
+        _resize_to_fit!(v, 0)
+        @test length(v) == 0
+        # Re-acquire at smaller size (still within original capacity)
+        _resize_to_fit!(v, 200)
+        @test length(v) == 200
+        @test UInt(pointer(v)) == ptr  # Same GPU buffer
+    end
+
     @testset "No-op when n == length" begin
         v = CUDA.zeros(Float32, 200)
         ptr = UInt(pointer(v))
-        _resize_without_shrink!(v, 200)
+        _resize_to_fit!(v, 200)
         @test length(v) == 200
         @test UInt(pointer(v)) == ptr
     end
 
-    @testset "Grow beyond maxsize delegates to resize!" begin
+    @testset "Grow beyond capacity delegates to resize!" begin
         v = CUDA.zeros(Float32, 100)
-        _resize_without_shrink!(v, 10_000)
+        _resize_to_fit!(v, 10_000)
         @test length(v) == 10_000
         # Pointer may change (new allocation) — just verify length is correct
     end
diff --git a/test/cuda/test_cuda_safety.jl b/test/cuda/test_cuda_safety.jl
index a91e5d47..7df427f3 100644
--- a/test/cuda/test_cuda_safety.jl
+++ b/test/cuda/test_cuda_safety.jl
@@ -56,7 +56,7 @@ _cuda_test_leak(x) = x
     # ==============================================================================
     # Level 1: Poisoning + structural invalidation (length → 0)
     # ==============================================================================
-    # CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0)
+    # CUDA Level 1 now: poison fill → _resize_to_fit!(vec, 0) + arr_wrappers invalidation
     # Backing vector length becomes 0 (GPU memory preserved via maxsize).
     # Poison data persists in GPU memory and is visible on re-acquire (grow-back).
 
@@ -141,17 +141,22 @@ _cuda_test_leak(x) = x
         rewind!(pool)
     end
 
-    @testset "Level 1: N-way cache invalidated on poisoned rewind" begin
+    @testset "Level 1: arr_wrappers invalidated on poisoned rewind" begin
         pool = _make_cuda_pool(1)
         checkpoint!(pool)
         v = acquire!(pool, Float32, 10)
         CUDA.fill!(v, 1.0f0)
         rewind!(pool)
 
-        # Cached views should be cleared (nothing) after poisoning
-        base = 0 * ext.CACHE_WAYS
-        for k in 1:ext.CACHE_WAYS
-            @test pool.float32.views[base + k] === nothing
+        # After rewind! the slot is released: every cached wrapper must report zero dims
+        tp = pool.float32
+        for N_idx in eachindex(tp.arr_wrappers)
+            wrappers_for_N = tp.arr_wrappers[N_idx]
+            wrappers_for_N === nothing && continue
+            for wrapper in wrappers_for_N
+                wrapper === nothing && continue
+                @test all(==(0), size(wrapper))
+            end
         end
     end
 
diff --git a/test/cuda/test_extension.jl b/test/cuda/test_extension.jl
index 01753a3e..ba6c8df4 100644
--- a/test/cuda/test_extension.jl
+++ b/test/cuda/test_extension.jl
@@ -6,10 +6,8 @@
         tp_fields = fieldnames(CuTypedPool)
         @test :vectors in tp_fields
         @test :n_active in tp_fields
-        # N-way cache fields
-        @test :views in tp_fields
-        @test :view_dims in tp_fields
-        @test :next_way in tp_fields  # Round-robin counter
+        # arr_wrappers (setfield!-based wrapper reuse, replaces N-way cache)
+        @test :arr_wrappers in tp_fields
         # State management
         @test :_checkpoint_n_active in tp_fields
         @test :_checkpoint_depths in tp_fields
diff --git a/test/cuda/test_nway_cache.jl b/test/cuda/test_nway_cache.jl
index 526c5e5c..7c76f7a0 100644
--- a/test/cuda/test_nway_cache.jl
+++ b/test/cuda/test_nway_cache.jl
@@ -1,8 +1,10 @@
-# CUDA N-way Cache Tests
-# Verifies N-way cache behavior for CuArray wrapper reuse
-# Key: 4-way cache means 4 dimension patterns = zero-alloc, 5+ = allocation
+# CUDA arr_wrappers Tests
+# Verifies setfield!-based CuArray wrapper reuse for zero-allocation acquire.
+# Key: arr_wrappers[N][slot] caches one CuArray{T,N} per (dimensionality N, slot).
+# Same N = setfield!(:dims) → zero-alloc for unlimited dimension patterns.
+# Different N = separate wrapper per N.
 
-@testset "N-way Cache Types" begin
+@testset "CuArray Wrapper Types" begin
 
     @testset "acquire! returns CuArray" begin
         @with_pool :cuda pool begin
@@ -28,32 +30,24 @@
         end
     end
 
-    @testset "CACHE_WAYS configuration" begin
-        # CACHE_WAYS is defined in CUDA extension (CPU ≥1.11 no longer uses view caching)
-        CUDAExt = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt)
-        @test CUDAExt.CACHE_WAYS isa Int
-        @test 1 <= CUDAExt.CACHE_WAYS <= 16
-    end
-
 end
 
-@testset "N-way Cache Behavior" begin
+@testset "arr_wrappers: Unlimited Same-N Patterns" begin
 
-    # Key principles:
-    # 1. GPU allocation should ALWAYS be 0 (memory reused from pool)
-    # 2. CPU allocation: cache hit (4-way) = 0, cache miss (5-way) = >0
+    # With arr_wrappers, same-N dimension patterns use setfield!(:dims).
+    # Unlike the old 4-way cache, there is NO eviction — unlimited patterns per N.
 
     # =========================================================================
     # GPU Allocation Tests (with fill! to actually use the arrays)
     # =========================================================================
 
-    @testset "GPU: 4-way zero-alloc" begin
+    @testset "GPU: 4 patterns zero-alloc" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25))
 
-        function test_4way_gpu()
+        function test_4pat_gpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     A = acquire!(p, Float64, dims...)
@@ -63,21 +57,21 @@ end
         end
 
         # Warmup
-        test_4way_gpu()
-        test_4way_gpu()
+        test_4pat_gpu()
+        test_4pat_gpu()
         GC.gc(); CUDA.reclaim()
 
-        gpu_alloc = CUDA.@allocated test_4way_gpu()
+        gpu_alloc = CUDA.@allocated test_4pat_gpu()
         @test gpu_alloc == 0
     end
 
-    @testset "GPU: 5-way zero-alloc (even with cache miss)" begin
+    @testset "GPU: 5+ patterns zero-alloc (no eviction with arr_wrappers)" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50))
 
-        function test_5way_gpu()
+        function test_5pat_gpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     A = acquire!(p, Float64, dims...)
@@ -87,11 +81,11 @@ end
         end
 
         # Warmup
-        test_5way_gpu()
-        test_5way_gpu()
+        test_5pat_gpu()
+        test_5pat_gpu()
         GC.gc(); CUDA.reclaim()
 
-        gpu_alloc = CUDA.@allocated test_5way_gpu()
+        gpu_alloc = CUDA.@allocated test_5pat_gpu()
         @test gpu_alloc == 0
     end
 
@@ -99,13 +93,13 @@ end
     # CPU Allocation Tests (no fill! to avoid CUDA kernel overhead)
     # =========================================================================
 
-    @testset "CPU: 4-way zero-alloc (cache hit)" begin
+    @testset "CPU: 4 patterns zero-alloc" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25))
 
-        function test_4way_cpu()
+        function test_4pat_cpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     _ = acquire!(p, Float64, dims...)
@@ -114,21 +108,22 @@ end
         end
 
         # Warmup
-        test_4way_cpu()
-        test_4way_cpu()
+        test_4pat_cpu()
+        test_4pat_cpu()
         GC.gc()
 
-        cpu_alloc = @allocated test_4way_cpu()
-        @test cpu_alloc == 0  # 4 patterns fit in 4-way cache
+        cpu_alloc = @allocated test_4pat_cpu()
+        @test cpu_alloc == 0
     end
 
-    @testset "CPU: 5-way causes allocation (cache miss)" begin
+    @testset "CPU: 5+ patterns zero-alloc (arr_wrappers: same-N uses setfield!)" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
+        # All 2D — same N=2 → single wrapper per slot, setfield!(:dims) only
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50))
 
-        function test_5way_cpu()
+        function test_5pat_cpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     _ = acquire!(p, Float64, dims...)
@@ -137,25 +132,25 @@ end
         end
 
         # Warmup
-        test_5way_cpu()
-        test_5way_cpu()
+        test_5pat_cpu()
+        test_5pat_cpu()
         GC.gc()
 
-        cpu_alloc = @allocated test_5way_cpu()
-        @test cpu_alloc > 0  # 5 patterns exceed 4-way cache
+        cpu_alloc = @allocated test_5pat_cpu()
+        @test cpu_alloc == 0  # No eviction — arr_wrappers support unlimited same-N patterns
     end
 
     # =========================================================================
     # unsafe_acquire! Tests
     # =========================================================================
 
-    @testset "unsafe_acquire! GPU: 4-way zero-alloc" begin
+    @testset "unsafe_acquire! GPU: 4 patterns zero-alloc" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((8, 8), (4, 16), (16, 4), (2, 32))
 
-        function test_unsafe_4way_gpu()
+        function test_unsafe_4pat_gpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     A = unsafe_acquire!(p, Float64, dims...)
@@ -165,21 +160,21 @@ end
         end
 
         # Warmup
-        test_unsafe_4way_gpu()
-        test_unsafe_4way_gpu()
+        test_unsafe_4pat_gpu()
+        test_unsafe_4pat_gpu()
         GC.gc(); CUDA.reclaim()
 
-        gpu_alloc = CUDA.@allocated test_unsafe_4way_gpu()
+        gpu_alloc = CUDA.@allocated test_unsafe_4pat_gpu()
         @test gpu_alloc == 0
     end
 
-    @testset "unsafe_acquire! CPU: 4-way zero-alloc" begin
+    @testset "unsafe_acquire! CPU: 4 patterns zero-alloc" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((8, 8), (4, 16), (16, 4), (2, 32))
 
-        function test_unsafe_4way_cpu()
+        function test_unsafe_4pat_cpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     _ = unsafe_acquire!(p, Float64, dims...)
@@ -188,21 +183,21 @@ end
         end
 
         # Warmup
-        test_unsafe_4way_cpu()
-        test_unsafe_4way_cpu()
+        test_unsafe_4pat_cpu()
+        test_unsafe_4pat_cpu()
         GC.gc()
 
-        cpu_alloc = @allocated test_unsafe_4way_cpu()
+        cpu_alloc = @allocated test_unsafe_4pat_cpu()
         @test cpu_alloc == 0
     end
 
-    @testset "unsafe_acquire! CPU: 5-way causes allocation" begin
+    @testset "unsafe_acquire! CPU: 5+ patterns zero-alloc" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((8, 8), (4, 16), (16, 4), (2, 32), (32, 2))
 
-        function test_unsafe_5way_cpu()
+        function test_unsafe_5pat_cpu()
             for dims in dims_list
                 @with_pool :cuda p begin
                     _ = unsafe_acquire!(p, Float64, dims...)
@@ -211,17 +206,248 @@ end
         end
 
         # Warmup
-        test_unsafe_5way_cpu()
-        test_unsafe_5way_cpu()
+        test_unsafe_5pat_cpu()
+        test_unsafe_5pat_cpu()
+        GC.gc()
+
+        cpu_alloc = @allocated test_unsafe_5pat_cpu()
+        @test cpu_alloc == 0  # arr_wrappers: unlimited same-N patterns
+    end
+
+end
+
+@testset "arr_wrappers: Mixed-N Patterns (1D + 2D + 3D)" begin
+
+    # arr_wrappers[N][slot] caches a separate wrapper per dimensionality N.
+    # Same slot, different N → each N gets its own wrapper (first use = cache miss).
+    # After warmup of all (slot, N) combos → zero-alloc for any mix.
+
+    # =========================================================================
+    # GPU Allocation Tests
+    # =========================================================================
+
+    @testset "GPU: 1D + 2D + 3D mixed acquire zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        # Each @with_pool scope resets slot counter → slot 1 gets 1D/2D/3D wrappers
+        function test_mixed_n_gpu()
+            @with_pool :cuda p begin
+                v = acquire!(p, Float64, 100)      # 1D — arr_wrappers[1][1]
+                fill!(v, 1.0)
+            end
+            @with_pool :cuda p begin
+                A = acquire!(p, Float64, 10, 10)    # 2D — arr_wrappers[2][1]
+                fill!(A, 2.0)
+            end
+            @with_pool :cuda p begin
+                T = acquire!(p, Float64, 5, 5, 4)   # 3D — arr_wrappers[3][1]
+                fill!(T, 3.0)
+            end
+        end
+
+        # Warmup (populates all 3 wrappers per N)
+        test_mixed_n_gpu()
+        test_mixed_n_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_mixed_n_gpu()
+        @test gpu_alloc == 0
+    end
+
+    @testset "GPU: mixed-N with varying dims zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        # Vary dims within each N across iterations
+        function test_mixed_n_varying_gpu()
+            for (d1, d2, d3) in (
+                    (100, (10, 10), (5, 5, 4)),
+                    (200, (5, 20), (4, 5, 10)),
+                    (50, (20, 5), (2, 5, 10)),
+                )
+                @with_pool :cuda p begin
+                    v = acquire!(p, Float64, d1)
+                    fill!(v, 1.0)
+                end
+                @with_pool :cuda p begin
+                    A = acquire!(p, Float64, d2...)
+                    fill!(A, 2.0)
+                end
+                @with_pool :cuda p begin
+                    T = acquire!(p, Float64, d3...)
+                    fill!(T, 3.0)
+                end
+            end
+        end
+
+        # Warmup
+        test_mixed_n_varying_gpu()
+        test_mixed_n_varying_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_mixed_n_varying_gpu()
+        @test gpu_alloc == 0
+    end
+
+    @testset "GPU: multi-slot mixed-N zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        # Multiple acquires within one scope → different slots, different N
+        function test_multi_slot_mixed_n_gpu()
+            @with_pool :cuda p begin
+                v = acquire!(p, Float64, 100)        # Slot 1, 1D
+                A = acquire!(p, Float64, 10, 10)     # Slot 2, 2D
+                T = acquire!(p, Float64, 5, 5, 4)    # Slot 3, 3D
+                fill!(v, 1.0)
+                fill!(A, 2.0)
+                fill!(T, 3.0)
+            end
+        end
+
+        # Warmup
+        test_multi_slot_mixed_n_gpu()
+        test_multi_slot_mixed_n_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_multi_slot_mixed_n_gpu()
+        @test gpu_alloc == 0
+    end
+
+    # =========================================================================
+    # CPU Allocation Tests (no fill! — wrapper creation overhead only)
+    # =========================================================================
+
+    @testset "CPU: 1D + 2D + 3D mixed acquire zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_mixed_n_cpu()
+            @with_pool :cuda p begin
+                _ = acquire!(p, Float64, 100)        # 1D
+            end
+            @with_pool :cuda p begin
+                _ = acquire!(p, Float64, 10, 10)     # 2D
+            end
+            @with_pool :cuda p begin
+                _ = acquire!(p, Float64, 5, 5, 4)    # 3D
+            end
+        end
+
+        # Warmup
+        test_mixed_n_cpu()
+        test_mixed_n_cpu()
+        GC.gc()
+
+        cpu_alloc = @allocated test_mixed_n_cpu()
+        @test cpu_alloc == 0
+    end
+
+    @testset "CPU: mixed-N with varying dims zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_mixed_n_varying_cpu()
+            for (d1, d2, d3) in (
+                    (100, (10, 10), (5, 5, 4)),
+                    (200, (5, 20), (4, 5, 10)),
+                    (50, (20, 5), (2, 5, 10)),
+                )
+                @with_pool :cuda p begin
+                    _ = acquire!(p, Float64, d1)
+                end
+                @with_pool :cuda p begin
+                    _ = acquire!(p, Float64, d2...)
+                end
+                @with_pool :cuda p begin
+                    _ = acquire!(p, Float64, d3...)
+                end
+            end
+        end
+
+        # Warmup
+        test_mixed_n_varying_cpu()
+        test_mixed_n_varying_cpu()
+        GC.gc()
+
+        cpu_alloc = @allocated test_mixed_n_varying_cpu()
+        @test cpu_alloc == 0
+    end
+
+    @testset "CPU: multi-slot mixed-N zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_multi_slot_mixed_n_cpu()
+            @with_pool :cuda p begin
+                _ = acquire!(p, Float64, 100)        # Slot 1, 1D
+                _ = acquire!(p, Float64, 10, 10)     # Slot 2, 2D
+                _ = acquire!(p, Float64, 5, 5, 4)    # Slot 3, 3D
+            end
+        end
+
+        # Warmup
+        test_multi_slot_mixed_n_cpu()
+        test_multi_slot_mixed_n_cpu()
         GC.gc()
 
-        cpu_alloc = @allocated test_unsafe_5way_cpu()
-        @test cpu_alloc > 0
+        cpu_alloc = @allocated test_multi_slot_mixed_n_cpu()
+        @test cpu_alloc == 0
+    end
+
+    # =========================================================================
+    # unsafe_acquire! Mixed-N Tests
+    # =========================================================================
+
+    @testset "unsafe_acquire! GPU: mixed-N zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_unsafe_mixed_n_gpu()
+            @with_pool :cuda p begin
+                v = unsafe_acquire!(p, Float64, 100)
+                A = unsafe_acquire!(p, Float64, 10, 10)
+                T = unsafe_acquire!(p, Float64, 5, 5, 4)
+                fill!(v, 1.0)
+                fill!(A, 2.0)
+                fill!(T, 3.0)
+            end
+        end
+
+        # Warmup
+        test_unsafe_mixed_n_gpu()
+        test_unsafe_mixed_n_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_unsafe_mixed_n_gpu()
+        @test gpu_alloc == 0
+    end
+
+    @testset "unsafe_acquire! CPU: mixed-N zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_unsafe_mixed_n_cpu()
+            @with_pool :cuda p begin
+                _ = unsafe_acquire!(p, Float64, 100)
+                _ = unsafe_acquire!(p, Float64, 10, 10)
+                _ = unsafe_acquire!(p, Float64, 5, 5, 4)
+            end
+        end
+
+        # Warmup
+        test_unsafe_mixed_n_cpu()
+        test_unsafe_mixed_n_cpu()
+        GC.gc()
+
+        cpu_alloc = @allocated test_unsafe_mixed_n_cpu()
+        @test cpu_alloc == 0
     end
 
 end
 
-@testset "N-way Cache: Loop Patterns" begin
+@testset "arr_wrappers: Loop Patterns" begin
 
     @testset "100 iterations: GPU always zero-alloc" begin
         pool = get_task_local_cuda_pool()
@@ -229,7 +455,7 @@ end
 
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25))
 
-        function test_loop_4way()
+        function test_loop_4pat()
             for _ in 1:100
                 for dims in dims_list
                     @with_pool :cuda p begin
@@ -241,20 +467,20 @@ end
         end
 
         # Warmup
-        test_loop_4way()
+        test_loop_4pat()
         GC.gc(); CUDA.reclaim()
 
-        gpu_alloc = CUDA.@allocated test_loop_4way()
+        gpu_alloc = CUDA.@allocated test_loop_4pat()
         @test gpu_alloc == 0  # GPU memory always reused
     end
 
-    @testset "100 iterations with 5 patterns: GPU still zero-alloc" begin
+    @testset "100 iterations with 5+ patterns: GPU still zero-alloc" begin
         pool = get_task_local_cuda_pool()
         reset!(pool)
 
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50))
 
-        function test_loop_5way()
+        function test_loop_5pat()
             for _ in 1:100
                 for dims in dims_list
                     @with_pool :cuda p begin
@@ -266,16 +492,16 @@ end
         end
 
         # Warmup
-        test_loop_5way()
+        test_loop_5pat()
         GC.gc(); CUDA.reclaim()
 
-        gpu_alloc = CUDA.@allocated test_loop_5way()
-        @test gpu_alloc == 0  # GPU memory reused even with cache thrashing
+        gpu_alloc = CUDA.@allocated test_loop_5pat()
+        @test gpu_alloc == 0  # GPU memory reused — no cache eviction
     end
 
 end
 
-@testset "N-way Cache: Multiple Slots" begin
+@testset "arr_wrappers: Multiple Slots" begin
 
     @testset "Multiple arrays per iteration: GPU zero-alloc" begin
         pool = get_task_local_cuda_pool()
@@ -306,7 +532,6 @@ end
         reset!(pool)
 
         # Use same dims for both slots, just vary across iterations
-        # This tests GPU memory reuse, not cache behavior
         dims_list = ((10, 10), (5, 20), (20, 5), (4, 25))
 
         function test_multi_slot_varying()
@@ -331,7 +556,126 @@ end
 
 end
 
-@testset "N-way Cache: Resize Behavior" begin
+@testset "arr_wrappers: reshape! Zero-Alloc" begin
+
+    # _reshape_impl! for CuArray uses arr_wrappers cache for cross-dim reshape,
+    # and in-place setfield!(:dims) for same-dim reshape (no pool interaction).
+
+    @testset "GPU: cross-dim reshape zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_reshape_cross_dim_gpu()
+            @with_pool :cuda p begin
+                A = acquire!(p, Float64, 12)
+                CUDA.fill!(A, 1.0)
+                # 1D → 2D (cross-dim: claims slot, uses arr_wrappers[2])
+                B = reshape!(p, A, 3, 4)
+                CUDA.fill!(B, 2.0)
+            end
+        end
+
+        # Warmup
+        test_reshape_cross_dim_gpu()
+        test_reshape_cross_dim_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_reshape_cross_dim_gpu()
+        @test gpu_alloc == 0
+    end
+
+    @testset "CPU: cross-dim reshape zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_reshape_cross_dim_cpu()
+            @with_pool :cuda p begin
+                A = acquire!(p, Float64, 12)
+                B = reshape!(p, A, 3, 4)
+            end
+        end
+
+        test_reshape_cross_dim_cpu()
+        test_reshape_cross_dim_cpu()
+        GC.gc()
+
+        cpu_alloc = @allocated test_reshape_cross_dim_cpu()
+        @test cpu_alloc == 0
+    end
+
+    @testset "GPU: same-dim reshape zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_reshape_same_dim_gpu()
+            @with_pool :cuda p begin
+                A = acquire!(p, Float64, 3, 4)
+                CUDA.fill!(A, 1.0)
+                # 2D → 2D (same-dim: in-place setfield!, no pool interaction)
+                B = reshape!(p, A, 4, 3)
+                CUDA.fill!(B, 2.0)
+            end
+        end
+
+        test_reshape_same_dim_gpu()
+        test_reshape_same_dim_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_reshape_same_dim_gpu()
+        @test gpu_alloc == 0
+    end
+
+    @testset "GPU: mixed reshape sequence zero-alloc" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        function test_reshape_mixed_gpu()
+            @with_pool :cuda p begin
+                A = acquire!(p, Float64, 24)
+                CUDA.fill!(A, 1.0)
+                B = reshape!(p, A, 4, 6)       # 1D → 2D
+                C = reshape!(p, A, 2, 3, 4)    # 1D → 3D
+                CUDA.fill!(B, 2.0)
+                CUDA.fill!(C, 3.0)
+            end
+        end
+
+        test_reshape_mixed_gpu()
+        test_reshape_mixed_gpu()
+        GC.gc(); CUDA.reclaim()
+
+        gpu_alloc = CUDA.@allocated test_reshape_mixed_gpu()
+        @test gpu_alloc == 0
+    end
+
+    @testset "Correctness: data sharing through reshape" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        @with_pool :cuda p begin
+            A = acquire!(p, Float64, 12)
+            B = reshape!(p, A, 3, 4)
+            @test size(B) == (3, 4)
+            @test B isa CuArray{Float64, 2}
+            @test length(B) == length(A)
+            CUDA.fill!(A, 7.0)  # write through A only (7.0 is unused by other tests)
+            @test all(==(7.0), Array(B))  # visible through B ⇒ B shares GPU memory with A
+        end
+    end
+
+    @testset "DimensionMismatch" begin
+        pool = get_task_local_cuda_pool()
+        reset!(pool)
+
+        @with_pool :cuda p begin
+            A = acquire!(p, Float64, 12)
+            @test_throws DimensionMismatch reshape!(p, A, 5, 5)
+        end
+    end
+
+end
+
+@testset "arr_wrappers: Resize Behavior" begin
 
     @testset "Resize: GPU zero-alloc maintained" begin
         pool = get_task_local_cuda_pool()