Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 1 addition & 14 deletions ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,6 @@ module AdaptiveArrayPoolsCUDAExt

using AdaptiveArrayPools
using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool
using Preferences: @load_preference, @set_preferences!

# N-way view cache configuration (CUDA only — CPU ≥1.11 uses slot-first _claim_slot!).
# GPU view/reshape allocates ~80 bytes on CPU heap, so caching still matters.
const CACHE_WAYS = let
ways = @load_preference("cache_ways", 4)::Int
if ways < 1 || ways > 16
@warn "CACHE_WAYS=$ways out of range [1,16], using default 4"
4
else
ways
end
end
using CUDA

# Type definitions
Expand All @@ -31,7 +18,7 @@ include("types.jl")
# Dispatch methods (allocate_vector, wrap_array, get_typed_pool!)
include("dispatch.jl")

# GPU-specific get_view! implementation
# GPU-specific acquire (arr_wrappers + setfield!, _resize_to_fit!, _reshape_impl!)
include("acquire.jl")

# Task-local pool (multi-device aware)
Expand Down
389 changes: 256 additions & 133 deletions ext/AdaptiveArrayPoolsCUDAExt/acquire.jl

Large diffs are not rendered by default.

34 changes: 19 additions & 15 deletions ext/AdaptiveArrayPoolsCUDAExt/debug.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,19 @@
# Safety levels on CUDA differ from CPU:
# - Level 0: Zero overhead (all branches dead-code-eliminated)
# - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via
# _resize_without_shrink!(vec, 0) + N-way cache invalidation
# _resize_to_fit!(vec, 0) + arr_wrappers invalidation (setfield!(:dims, zeros))
# - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays)
# - Level 3: Full + borrow call-site registry + debug messages
#
# Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays.
# On CUDA, resize!(CuVector, 0) would free GPU memory, so we use
# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving
# _resize_to_fit!(vec, 0) instead — sets dims to (0,) while preserving
# the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink.
# arr_wrappers are invalidated by setting wrapper dims to zeros (matches CPU pattern).

using AdaptiveArrayPools: _safety_level, _validate_pool_return,
_set_pending_callsite!, _maybe_record_borrow!,
_invalidate_released_slots!,
_invalidate_released_slots!, _zero_dims_tuple,
_throw_pool_escape_error,
POOL_DEBUG, POOL_SAFETY_LV,
PoolRuntimeEscapeError
Expand Down Expand Up @@ -49,26 +50,29 @@ end
#
# Overrides the no-op fallback in base. On CUDA:
# - Level 0: no-op (base _rewind_typed_pool! gates with S >= 1, so never called)
# - Level 1+: poison released CuVectors + invalidate N-way view cache
# - NO resize!(cuv, 0) — would free GPU memory
# - Level 1+: poison released CuVectors + invalidate arr_wrappers
# - NO resize!(cuv, 0) — would free GPU memory; use _resize_to_fit! instead

@noinline function AdaptiveArrayPools._invalidate_released_slots!(
tp::CuTypedPool{T}, old_n_active::Int, S::Int
) where {T}
new_n = tp.n_active
# Poison released CuVectors + shrink logical length to 0
for i in (new_n + 1):old_n_active
# Poison released CuVectors with sentinel values
_cuda_poison_fill!(@inbounds tp.vectors[i])
# Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!).
# Shrink logical length to 0 (GPU memory preserved via _resize_to_fit!).
# Matches CPU behavior where resize!(vec, 0) invalidates SubArray references.
_resize_without_shrink!(@inbounds(tp.vectors[i]), 0)
# Invalidate N-way cache entries for released slots.
# After poisoning, cached views point at poisoned data — clear them so
# re-acquire creates fresh views instead of returning stale poisoned ones.
base = (i - 1) * CACHE_WAYS
for k in 1:CACHE_WAYS
@inbounds tp.views[base + k] = nothing
@inbounds tp.view_dims[base + k] = nothing
_resize_to_fit!(@inbounds(tp.vectors[i]), 0)
end
# Invalidate arr_wrappers for released slots (matches CPU pattern from src/state.jl)
for N_idx in 1:length(tp.arr_wrappers)
wrappers_for_N = @inbounds tp.arr_wrappers[N_idx]
wrappers_for_N === nothing && continue
wrappers = wrappers_for_N::Vector{Any}
for i in (new_n + 1):min(old_n_active, length(wrappers))
wrapper = @inbounds wrappers[i]
wrapper === nothing && continue
setfield!(wrapper::CuArray, :dims, _zero_dims_tuple(N_idx))
end
end
return nothing
Expand Down
4 changes: 1 addition & 3 deletions ext/AdaptiveArrayPoolsCUDAExt/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,7 @@ CUDA.reclaim()
"""
function Base.empty!(tp::CuTypedPool)
empty!(tp.vectors)
empty!(tp.views)
empty!(tp.view_dims)
empty!(tp.next_way)
empty!(tp.arr_wrappers)
tp.n_active = 0
# Restore sentinel values
empty!(tp._checkpoint_n_active)
Expand Down
48 changes: 20 additions & 28 deletions ext/AdaptiveArrayPoolsCUDAExt/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,38 @@
# ==============================================================================

# Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()),
# NOT SubArray. However, we still cache view objects to avoid CPU heap allocation
# (~80 bytes per call) for the CuVector metadata wrapper.

# Note: Uses shared CACHE_WAYS constant from main module for consistency.
# NOT SubArray. GPU view/reshape creation allocates ~80 bytes on CPU heap for the
# CuArray wrapper. We cache wrappers via arr_wrappers to achieve zero-allocation
# on cache hit (same approach as CPU's setfield!-based Array wrapper reuse).

"""
CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}}

GPU memory pool for element type `T`. Uses unified N-way view caching for all dimensions.
GPU memory pool for element type `T`. Uses `arr_wrappers`-based CuArray reuse
for zero-allocation acquire (same design as CPU TypedPool on Julia 1.11+).

## Fields
- `vectors`: Backing `CuVector{T}` storage (one per slot)
- `views`: Flat N-way cache storing CuArray of any dimension
- Layout: `views[(slot-1)*CACHE_WAYS + way]` for way ∈ 1:CACHE_WAYS
- `view_dims`: Cached dims corresponding to views
- `next_way`: Round-robin counter per slot for cache replacement
- `arr_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — indexed by N (dimensionality),
each entry is a per-slot cached `CuArray{T,N}` wrapper. Uses `setfield!(wrapper, :dims, dims)`
for zero-allocation reuse of unlimited dimension patterns within the same N.
When the backing vector's GPU buffer changes (rare: only on grow beyond capacity),
the wrapper's `:data` field is updated via DataRef refcount management.
- State management fields (same as CPU)

## Design Note
Unlike CPU where view() returns SubArray and reshape() returns ReshapedArray,
CUDA returns CuArray for both operations. This allows a unified cache that
stores CuArray{T,N} for any N, eliminating the need for separate 1D/N-D caches.

GPU view/reshape creation allocates ~80 bytes on CPU heap for the CuArray
wrapper object. N-way caching with for-loop lookup eliminates this allocation
when the same dimensions pattern is requested again.
Unlike CPU where `setfield!(:ref, MemoryRef)` is free (GC-managed),
CuArray's `:data` field is `DataRef` with manual refcounting. We minimize this cost
via `wrapper.data.rc !== vec.data.rc` identity check (~2ns): only update `:data`
when the backing vector's GPU buffer actually changed. The common case (same buffer)
is a simple `setfield!(:dims)` — truly zero-allocation.
"""
mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}}
# --- Storage ---
vectors::Vector{CuVector{T}}

# --- Unified N-Way View Cache (flat layout) ---
# Length = n_slots * CACHE_WAYS
views::Vector{Any} # CuArray{T,N} for any N
view_dims::Vector{Any} # NTuple{N,Int} or nothing

# --- Cache Replacement (round-robin per slot) ---
next_way::Vector{Int} # next_way[slot] ∈ 1:CACHE_WAYS
# --- N-D Wrapper Cache (setfield!-based reuse, matches CPU TypedPool) ---
arr_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot CuArray{T,N}

# --- State Management (1-based sentinel pattern) ---
n_active::Int
Expand All @@ -50,11 +44,9 @@ end

function CuTypedPool{T}() where {T}
return CuTypedPool{T}(
CuVector{T}[], # vectors
Any[], # views (N-way flat cache)
Any[], # view_dims
Int[], # next_way (round-robin counters)
0, [0], [0] # State (1-based sentinel)
CuVector{T}[], # vectors
Union{Nothing, Vector{Any}}[], # arr_wrappers (indexed by N)
0, [0], [0] # State (1-based sentinel)
)
end

Expand Down
13 changes: 12 additions & 1 deletion src/legacy/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@
# Identical to v1.11+ state.jl except empty! clears legacy N-way cache fields
# (nd_arrays, nd_dims, nd_ptrs, nd_next_way) instead of nd_wrappers.

# Build an all-zero dims tuple of length `N`, used when invalidating cached
# array wrappers. The common small-N cases (N ≤ 4) are spelled out as literal
# tuples: with a runtime `N`, `ntuple(_ -> 0, N)` cannot infer a concrete
# tuple length and would box its result. Larger N is extremely rare, so the
# generic `ntuple` fallback is acceptable there.
@inline function _zero_dims_tuple(N::Int)
    if N == 1
        return (0,)
    elseif N == 2
        return (0, 0)
    elseif N == 3
        return (0, 0, 0)
    elseif N == 4
        return (0, 0, 0, 0)
    else
        return ntuple(_ -> 0, N)
    end
end

# ==============================================================================
# State Management - checkpoint!
# ==============================================================================
Expand Down Expand Up @@ -300,7 +311,7 @@ end
dims === nothing && continue
N = length(dims::Tuple)
setfield!(ba::BitArray, :len, 0)
setfield!(ba::BitArray, :dims, ntuple(_ -> 0, N))
setfield!(ba::BitArray, :dims, _zero_dims_tuple(N))
end
end
return nothing
Expand Down
15 changes: 13 additions & 2 deletions src/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,17 @@ end
_invalidate_released_slots!(::AbstractTypedPool, ::Int, ::Int) = nothing
_invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing # legacy 2-arg compat

# Build an all-zero dims tuple of length `N`, used when invalidating cached
# array wrappers. The common small-N cases (N ≤ 4) are spelled out as literal
# tuples: with a runtime `N`, `ntuple(_ -> 0, N)` cannot infer a concrete
# tuple length and would box its result. Larger N is extremely rare, so the
# generic `ntuple` fallback is acceptable there.
@inline function _zero_dims_tuple(N::Int)
    if N == 1
        return (0,)
    elseif N == 2
        return (0, 0)
    elseif N == 3
        return (0, 0, 0)
    elseif N == 4
        return (0, 0, 0, 0)
    else
        return ntuple(_ -> 0, N)
    end
end

@noinline function _invalidate_released_slots!(tp::TypedPool{T}, old_n_active::Int, S::Int = POOL_SAFETY_LV[]) where {T}
new_n = tp.n_active
# Level 2+: poison vectors with NaN/sentinel before structural invalidation
Expand All @@ -262,7 +273,7 @@ _invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing # legacy 2-ar
for i in (new_n + 1):min(old_n_active, length(wrappers))
wrapper = @inbounds wrappers[i]
wrapper === nothing && continue
setfield!(wrapper::Array, :size, ntuple(_ -> 0, N_idx))
setfield!(wrapper::Array, :size, _zero_dims_tuple(N_idx))
end
end
return nothing
Expand All @@ -288,7 +299,7 @@ end
wrapper === nothing && continue
ba = wrapper::BitArray
setfield!(ba, :len, 0)
setfield!(ba, :dims, ntuple(_ -> 0, N_idx))
setfield!(ba, :dims, _zero_dims_tuple(N_idx))
end
end
return nothing
Expand Down
41 changes: 28 additions & 13 deletions test/cuda/test_allocation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,53 +169,68 @@

end

@testset "_resize_without_shrink! GPU memory preservation" begin
_resize_without_shrink! = ext._resize_without_shrink!
@testset "_resize_to_fit! GPU memory preservation" begin
_resize_to_fit! = ext._resize_to_fit!

@testset "Shrink preserves GPU pointer" begin
v = CUDA.zeros(Float32, 1000)
ptr = UInt(pointer(v))
_resize_without_shrink!(v, 100)
_resize_to_fit!(v, 100)
@test length(v) == 100
@test UInt(pointer(v)) == ptr
end

@testset "Grow-back within maxsize: no realloc" begin
@testset "Grow-back within capacity: no realloc" begin
v = CUDA.zeros(Float32, 1000)
ptr = UInt(pointer(v))
# Shrink first
_resize_without_shrink!(v, 100)
_resize_to_fit!(v, 100)
@test length(v) == 100
@test UInt(pointer(v)) == ptr
# Grow back to original size — maxsize preserved, so no GPU realloc
_resize_without_shrink!(v, 1000)
_resize_to_fit!(v, 1000)
@test length(v) == 1000
@test UInt(pointer(v)) == ptr
end

@testset "Shrink to 0 preserves pointer" begin
@testset "Shrink to 0, grow back preserves pointer" begin
v = CUDA.zeros(Float32, 500)
ptr = UInt(pointer(v))
_resize_without_shrink!(v, 0)
_resize_to_fit!(v, 0)
@test length(v) == 0
# GPU memory still allocated (not freed)
# Grow back from 0
_resize_without_shrink!(v, 500)
# Grow back from 0 — within capacity, so no GPU realloc
_resize_to_fit!(v, 500)
@test length(v) == 500
@test UInt(pointer(v)) == ptr
end

@testset "Grow within capacity after invalidation: no realloc" begin
# This is the key test: after safety invalidation (dims→0),
# re-acquire within original capacity should NOT trigger GPU realloc.
# (CUDA.jl v5.9.x resize! would always reallocate; _resize_to_fit! avoids this)
v = CUDA.zeros(Float32, 1000)
ptr = UInt(pointer(v))
# Simulate safety invalidation
_resize_to_fit!(v, 0)
@test length(v) == 0
# Re-acquire at smaller size (still within original capacity)
_resize_to_fit!(v, 200)
@test length(v) == 200
@test UInt(pointer(v)) == ptr # Same GPU buffer
end

@testset "No-op when n == length" begin
v = CUDA.zeros(Float32, 200)
ptr = UInt(pointer(v))
_resize_without_shrink!(v, 200)
_resize_to_fit!(v, 200)
@test length(v) == 200
@test UInt(pointer(v)) == ptr
end

@testset "Grow beyond maxsize delegates to resize!" begin
@testset "Grow beyond capacity delegates to resize!" begin
v = CUDA.zeros(Float32, 100)
_resize_without_shrink!(v, 10_000)
_resize_to_fit!(v, 10_000)
@test length(v) == 10_000
# Pointer may change (new allocation) — just verify length is correct
end
Expand Down
17 changes: 11 additions & 6 deletions test/cuda/test_cuda_safety.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ _cuda_test_leak(x) = x
# ==============================================================================
# Level 1: Poisoning + structural invalidation (length → 0)
# ==============================================================================
# CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0)
# CUDA Level 1 now: poison fill → _resize_to_fit!(vec, 0) + arr_wrappers invalidation
# Backing vector length becomes 0 (GPU memory preserved via maxsize).
# Poison data persists in GPU memory and is visible on re-acquire (grow-back).

Expand Down Expand Up @@ -141,17 +141,22 @@ _cuda_test_leak(x) = x
rewind!(pool)
end

@testset "Level 1: N-way cache invalidated on poisoned rewind" begin
@testset "Level 1: arr_wrappers invalidated on poisoned rewind" begin
pool = _make_cuda_pool(1)
checkpoint!(pool)
v = acquire!(pool, Float32, 10)
CUDA.fill!(v, 1.0f0)
rewind!(pool)

# Cached views should be cleared (nothing) after poisoning
base = 0 * ext.CACHE_WAYS
for k in 1:ext.CACHE_WAYS
@test pool.float32.views[base + k] === nothing
# arr_wrappers for released slots should have zero-dims after invalidation
tp = pool.float32
for N_idx in 1:length(tp.arr_wrappers)
wrappers_for_N = tp.arr_wrappers[N_idx]
wrappers_for_N === nothing && continue
for wrapper in wrappers_for_N
wrapper === nothing && continue
@test all(==(0), size(wrapper))
end
end
end

Expand Down
6 changes: 2 additions & 4 deletions test/cuda/test_extension.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
tp_fields = fieldnames(CuTypedPool)
@test :vectors in tp_fields
@test :n_active in tp_fields
# N-way cache fields
@test :views in tp_fields
@test :view_dims in tp_fields
@test :next_way in tp_fields # Round-robin counter
# arr_wrappers (setfield!-based wrapper reuse, replaces N-way cache)
@test :arr_wrappers in tp_fields
# State management
@test :_checkpoint_n_active in tp_fields
@test :_checkpoint_depths in tp_fields
Expand Down
Loading
Loading