Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 57 additions & 19 deletions ext/AdaptiveArrayPoolsCUDAExt/acquire.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,63 @@
# Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable).
#
# ==============================================================================
# Memory Resize Strategy
# Memory Resize Strategy: _resize_without_shrink!
# ==============================================================================
# Current: RESIZE TO FIT - backing vectors grow or shrink to match requested size.
# Same behavior as CPU version.
# GPU vs CPU difference:
# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged, cheap)
# - GPU CuVector: resize!(v, smaller) may reallocate when n < capacity÷4
# (CUDA.jl's 25% threshold triggers pool_alloc + copy + pool_free)
#
# GPU vs CPU difference (verified experimentally):
# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged)
# - GPU CuVector: resize!(v, smaller) may reallocate (CUDA.jl uses 25% threshold)
# However, CUDA memory pool often returns the same block on regrow.
# Problem: Pool operations frequently shrink backing vectors:
# - Safety invalidation: resize!(vec, 0) to invalidate released slots
# - Acquire path: resize!(vec, smaller_size) when reusing a slot for smaller array
# Both trigger expensive GPU reallocation via CUDA.jl's 25% threshold.
#
# TODO: Potential future optimizations:
# - CUDA.jl's resize! already uses 25% threshold internally (no realloc if within capacity)
# - Could use even smaller threshold (e.g., 12.5%) to be more aggressive about shrinking
# - Could track recent N sizes to make smarter decisions (avoid shrink if sizes fluctuate)
# Solution: _resize_without_shrink!(A, n)
# - Grow (n > length): delegates to CUDA.jl resize! (may allocate more GPU memory)
# - Shrink (n < length): setfield!(A, :dims, (n,)) — logical size only, no GPU op
# - Equal (n == length): no-op
#
# Key property: maxsize is preserved on shrink. When later growing back,
# CUDA.jl computes cap = maxsize ÷ aligned_sizeof(T) and sees n ≤ cap,
# so no reallocation occurs. This is ideal for pool's borrow/return pattern.
#
# ⚠ Depends on CuArray internal fields (:dims, :maxsize). Tested with CUDA.jl v5.x.
# ==============================================================================

using AdaptiveArrayPools: get_view!, get_array!, allocate_vector, safe_prod,
_record_type_touch!, _fixed_slot_bit, _checkpoint_typed_pool!,
_MODE_BITS_MASK

# Guard against CUDA.jl internal API changes (tested with v5.x).
# setfield!(:dims) requires CuArray to be a mutable struct with a :dims field.
# NOTE: the correct mutability predicate for a *type* is `ismutabletype`.
# The previous `ismutable(CuArray)` queried the mutability of the type object
# itself (a DataType/UnionAll, which is always a mutable heap object), so the
# guard could never fire even if CuArray became an immutable struct.
# `ismutabletype` and `hasfield` both unwrap the UnionAll before checking.
@static if !(ismutabletype(CuArray) && hasfield(CuArray, :dims))
    error("Unsupported CUDA.jl version: expected mutable CuArray with field :dims. _resize_without_shrink! needs updating.")
end

"""
    _resize_without_shrink!(A::CuVector{T}, n::Integer) -> CuVector{T}

Adjust the logical length of a CuVector without releasing GPU memory on shrink.

- `n > length(A)`: falls through to `resize!(A, n)` (CUDA.jl may grow the allocation)
- `n == length(A)`: nothing to do
- `n < length(A)`: rewrites only the `dims` field; the backing buffer stays at `maxsize`

This sidesteps CUDA.jl's shrink-reallocation heuristic (`n < cap÷4` triggers
`pool_alloc` + `unsafe_copyto!` + `pool_free`), which is costly for pool
operations such as safety invalidation (`resize!(v, 0)`) and acquire-path resizing.
"""
@inline function _resize_without_shrink!(A::CuVector{T}, n::Integer) where {T}
    target = Int(n)
    len = length(A)
    if target < len
        # Shrink: update the logical size only — no GPU operation, buffer preserved.
        setfield!(A, :dims, (target,))
    elseif target > len
        # Grow: delegate to CUDA.jl; reuses the buffer while target fits in maxsize.
        resize!(A, target)
    end
    return A
end

"""
get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T}

Expand Down Expand Up @@ -59,8 +96,9 @@ Returns cached view on hit (near-zero CPU allocation), creates new on miss.
- N-way caching eliminates this allocation on cache hit

## Memory Resize Strategy
Backing vectors are resized to match requested size (grow or shrink).
See module header for "lazy shrink" optimization notes.
Backing vectors use `_resize_without_shrink!`: grow delegates to CUDA.jl's
`resize!` (may reallocate), shrink only updates `dims` (GPU memory preserved).
See module header for details.
"""
@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N}
tp.n_active += 1
Expand Down Expand Up @@ -110,12 +148,12 @@ See module header for "lazy shrink" optimization notes.
@inbounds vec = tp.vectors[idx]
current_len = length(vec)
if current_len != total_len
# Resize vector to match requested size (grow or shrink)
# Note: CUDA.jl's resize! internally uses 25% threshold - won't reallocate
# unless new size exceeds capacity or is <25% of capacity.
resize!(vec, total_len)
# CRITICAL: resize! may reallocate the GPU buffer (pointer change).
# All cached views for this slot now reference the OLD buffer.
# Resize vector to match requested size (grow or shrink).
# Uses _resize_without_shrink! to avoid GPU reallocation on shrink.
_resize_without_shrink!(vec, total_len)
# CRITICAL: on grow, _resize_without_shrink! delegates to resize! which
# may reallocate the GPU buffer (pointer change). On shrink, pointer is
# stable but length changed. Either way, cached views are stale.
# Must invalidate ALL ways to prevent returning stale/dangling views.
for k in 1:CACHE_WAYS
@inbounds tp.views[base + k] = nothing
Expand Down
11 changes: 8 additions & 3 deletions ext/AdaptiveArrayPoolsCUDAExt/debug.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
#
# Safety levels on CUDA differ from CPU:
# - Level 0: Zero overhead (all branches dead-code-eliminated)
# - Level 1: Poisoning (NaN/sentinel fill) + N-way cache invalidation
# (CUDA equivalent of CPU's resize!/setfield! structural invalidation)
# - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via
# _resize_without_shrink!(vec, 0) + N-way cache invalidation
# - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays)
# - Level 3: Full + borrow call-site registry + debug messages
#
# Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays.
# On CUDA, resize!(CuVector, 0) frees GPU memory, so we use poisoning instead.
# On CUDA, resize!(CuVector, 0) would free GPU memory, so we use
# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving
# the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink.

using AdaptiveArrayPools: _safety_level, _validate_pool_return,
_set_pending_callsite!, _maybe_record_borrow!,
Expand Down Expand Up @@ -57,6 +59,9 @@ end
for i in (new_n + 1):old_n_active
# Poison released CuVectors with sentinel values
_cuda_poison_fill!(@inbounds tp.vectors[i])
# Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!).
# Matches CPU behavior where resize!(vec, 0) invalidates SubArray references.
_resize_without_shrink!(@inbounds(tp.vectors[i]), 0)
# Invalidate N-way cache entries for released slots.
# After poisoning, cached views point at poisoned data — clear them so
# re-acquire creates fresh views instead of returning stale poisoned ones.
Expand Down
52 changes: 52 additions & 0 deletions test/cuda/test_allocation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,58 @@

end

@testset "_resize_without_shrink! GPU memory preservation" begin
    # Local alias for the extension's internal helper under test.
    _resize_without_shrink! = ext._resize_without_shrink!

    @testset "Shrink preserves GPU pointer" begin
        buf = CUDA.zeros(Float32, 1000)
        addr = UInt(pointer(buf))
        _resize_without_shrink!(buf, 100)
        @test length(buf) == 100
        @test UInt(pointer(buf)) == addr
    end

    @testset "Grow-back within maxsize: no realloc" begin
        buf = CUDA.zeros(Float32, 1000)
        addr = UInt(pointer(buf))
        # Shrink first — logical size only, buffer retained.
        _resize_without_shrink!(buf, 100)
        @test length(buf) == 100
        @test UInt(pointer(buf)) == addr
        # Grow back to the original size — maxsize preserved, so no GPU realloc.
        _resize_without_shrink!(buf, 1000)
        @test length(buf) == 1000
        @test UInt(pointer(buf)) == addr
    end

    @testset "Shrink to 0 preserves pointer" begin
        buf = CUDA.zeros(Float32, 500)
        addr = UInt(pointer(buf))
        _resize_without_shrink!(buf, 0)
        @test length(buf) == 0
        # GPU memory still allocated (not freed); grow back from 0.
        _resize_without_shrink!(buf, 500)
        @test length(buf) == 500
        @test UInt(pointer(buf)) == addr
    end

    @testset "No-op when n == length" begin
        buf = CUDA.zeros(Float32, 200)
        addr = UInt(pointer(buf))
        _resize_without_shrink!(buf, 200)
        @test length(buf) == 200
        @test UInt(pointer(buf)) == addr
    end

    @testset "Grow beyond maxsize delegates to resize!" begin
        buf = CUDA.zeros(Float32, 100)
        _resize_without_shrink!(buf, 10_000)
        @test length(buf) == 10_000
        # Pointer may change (new allocation) — only the length is asserted.
    end
end

@testset "CPU Allocation (CuArray wrapper)" begin

@testset "acquire! N-D has low CPU allocation (cache hit)" begin
Expand Down
79 changes: 57 additions & 22 deletions test/cuda/test_cuda_safety.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,22 @@ _cuda_test_leak(x) = x
end

# ==============================================================================
# Level 1: Poisoning (CUDA equivalent of CPU's resize! invalidation)
# Level 1: Poisoning + structural invalidation (length → 0)
# ==============================================================================
# CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0)
# Backing vector length becomes 0 (GPU memory preserved via maxsize).
# Poison data persists in GPU memory and is visible on re-acquire (grow-back).

# Level 1 safety on CUDA: after rewind!, a released slot's backing vector is
# structurally invalidated by shrinking its logical length to 0. Per the
# _resize_without_shrink! contract the GPU allocation itself is presumably
# retained (maxsize preserved) — the test below asserts only the length.
@testset "Level 1: released vectors have length 0 after rewind" begin
pool = _make_cuda_pool(1)
checkpoint!(pool)
v = acquire!(pool, Float32, 100)
# Fill with a recognizable value so later poison/invalidation is observable.
CUDA.fill!(v, 42.0f0)
rewind!(pool)

# Structural invalidation: length → 0 (matches CPU behavior)
@test length(pool.float32.vectors[1]) == 0
end

@testset "Level 1: Float32 poisoned with NaN on rewind" begin
pool = _make_cuda_pool(1)
Expand All @@ -64,12 +78,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, 42.0f0)
rewind!(pool)

# Backing vector should be poisoned with NaN (NOT freed via resize!)
@test length(pool.float32.vectors[1]) >= 10
cpu_data = Array(pool.float32.vectors[1])
@test all(isnan, cpu_data[1:10])
# Backing vector length is 0 after invalidation
@test length(pool.float32.vectors[1]) == 0

# Re-acquire: should see poisoned data
# Re-acquire: grow-back reuses same GPU memory → poison data visible
checkpoint!(pool)
v2 = acquire!(pool, Float32, 10)
@test all(isnan, Array(v2))
Expand All @@ -83,8 +95,11 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, Int32(42))
rewind!(pool)

cpu_data = Array(pool.int32.vectors[1])
@test all(==(typemax(Int32)), cpu_data[1:8])
# Verify via re-acquire (backing vector length is 0 after invalidation)
checkpoint!(pool)
v2 = acquire!(pool, Int32, 8)
@test all(==(typemax(Int32)), Array(v2))
rewind!(pool)
end

@testset "Level 1: ComplexF32 poisoned with NaN on rewind" begin
Expand All @@ -94,8 +109,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, ComplexF32(1.0f0 + 2.0f0im))
rewind!(pool)

cpu_data = Array(pool.complexf32.vectors[1])
@test all(z -> isnan(real(z)) && isnan(imag(z)), cpu_data[1:8])
checkpoint!(pool)
v2 = acquire!(pool, ComplexF32, 8)
@test all(z -> isnan(real(z)) && isnan(imag(z)), Array(v2))
rewind!(pool)
end

@testset "Level 1: Bool poisoned with true on rewind" begin
Expand All @@ -105,8 +122,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, false)
rewind!(pool)

cpu_data = Array(pool.bool.vectors[1])
@test all(==(true), cpu_data[1:16])
checkpoint!(pool)
v2 = acquire!(pool, Bool, 16)
@test all(==(true), Array(v2))
rewind!(pool)
end

@testset "Level 1: Float16 poisoned with NaN on rewind" begin
Expand All @@ -116,8 +135,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, Float16(42.0))
rewind!(pool)

cpu_data = Array(pool.float16.vectors[1])
@test all(isnan, cpu_data[1:10])
checkpoint!(pool)
v2 = acquire!(pool, Float16, 10)
@test all(isnan, Array(v2))
rewind!(pool)
end

@testset "Level 1: N-way cache invalidated on poisoned rewind" begin
Expand Down Expand Up @@ -330,18 +351,26 @@ _cuda_test_leak(x) = x
CUDA.fill!(v_inner, 2.0f0)
rewind!(pool)

# Inner should be poisoned (slot 2 released)
cpu_inner = Array(pool.float32.vectors[2])
@test all(isnan, cpu_inner[1:20])
# Inner should be invalidated (slot 2: length → 0, poisoned)
@test length(pool.float32.vectors[2]) == 0
# Verify poison via re-acquire
checkpoint!(pool)
v_inner2 = acquire!(pool, Float32, 20)
@test all(isnan, Array(v_inner2))
rewind!(pool)

# Outer should still be valid (slot 1 not released)
cpu_outer = Array(v_outer)
@test all(x -> x == 1.0f0, cpu_outer)

rewind!(pool)
# Now outer is also poisoned
cpu_outer_after = Array(pool.float32.vectors[1])
@test all(isnan, cpu_outer_after[1:10])
# Now outer is also invalidated (length → 0, poisoned)
@test length(pool.float32.vectors[1]) == 0
# Verify poison via re-acquire
checkpoint!(pool)
v_outer2 = acquire!(pool, Float32, 10)
@test all(isnan, Array(v_outer2))
rewind!(pool)
end

# ==============================================================================
Expand Down Expand Up @@ -410,9 +439,15 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, UInt8(42))
rewind!(pool)

# Backing vector length → 0 after invalidation
tp = pool.others[UInt8]
cpu_data = Array(tp.vectors[1])
@test all(==(typemax(UInt8)), cpu_data[1:16])
@test length(tp.vectors[1]) == 0

# Verify poison via re-acquire
checkpoint!(pool)
v2 = acquire!(pool, UInt8, 16)
@test all(==(typemax(UInt8)), Array(v2))
rewind!(pool)
end

# ==============================================================================
Expand Down
Loading