diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index e6149ca1..ca1f6583 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -11,26 +11,63 @@ # Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable). # # ============================================================================== -# Memory Resize Strategy +# Memory Resize Strategy: _resize_without_shrink! # ============================================================================== -# Current: RESIZE TO FIT - backing vectors grow or shrink to match requested size. -# Same behavior as CPU version. +# GPU vs CPU difference: +# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged, cheap) +# - GPU CuVector: resize!(v, smaller) may reallocate when n < capacity÷4 +# (CUDA.jl's 25% threshold triggers pool_alloc + copy + pool_free) # -# GPU vs CPU difference (verified experimentally): -# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged) -# - GPU CuVector: resize!(v, smaller) may reallocate (CUDA.jl uses 25% threshold) -# However, CUDA memory pool often returns the same block on regrow. +# Problem: Pool operations frequently shrink backing vectors: +# - Safety invalidation: resize!(vec, 0) to invalidate released slots +# - Acquire path: resize!(vec, smaller_size) when reusing a slot for smaller array +# Both trigger expensive GPU reallocation via CUDA.jl's 25% threshold. # -# TODO: Potential future optimizations: -# - CUDA.jl's resize! already uses 25% threshold internally (no realloc if within capacity) -# - Could use even smaller threshold (e.g., 12.5%) to be more aggressive about shrinking -# - Could track recent N sizes to make smarter decisions (avoid shrink if sizes fluctuate) +# Solution: _resize_without_shrink!(A, n) +# - Grow (n > length): delegates to CUDA.jl resize! 
(may allocate more GPU memory)
+# - Shrink (n < length): setfield!(A, :dims, (n,)) — logical size only, no GPU op
+# - Equal (n == length): no-op
+#
+# Key property: maxsize is preserved on shrink. When later growing back,
+# CUDA.jl computes cap = maxsize ÷ aligned_sizeof(T) and sees n ≤ cap,
+# so no reallocation occurs. This is ideal for pool's borrow/return pattern.
+#
+# ⚠ Depends on CuArray internal fields (:dims, .maxsize). Tested with CUDA.jl v5.x.
+# ==============================================================================

 using AdaptiveArrayPools: get_view!, get_array!, allocate_vector, safe_prod,
                           _record_type_touch!, _fixed_slot_bit, _checkpoint_typed_pool!,
                           _MODE_BITS_MASK

+# Guard against CUDA.jl internal API changes (tested with v5.x).
+# setfield!(:dims) requires CuArray to be a mutable *type* with a :dims field;
+# use ismutabletype (ismutable(CuArray) tests the type object, always true).
+@static if !(ismutabletype(CuArray) && hasfield(CuArray, :dims))
+    error("Unsupported CUDA.jl version: expected mutable CuArray with field :dims. _resize_without_shrink! needs updating.")
+end
+
+"""
+    _resize_without_shrink!(A::CuVector{T}, n::Integer) -> CuVector{T}
+
+Resize a CuVector's logical length without freeing GPU memory on shrink.
+
+- `n > length(A)`: delegates to `resize!(A, n)` (may grow GPU allocation)
+- `n == length(A)`: no-op
+- `n < length(A)`: only updates `dims` field (GPU memory preserved at `maxsize`)
+
+Avoids CUDA.jl's 25% threshold reallocation on shrink (`n < cap÷4` triggers
+`pool_alloc` + `unsafe_copyto!` + `pool_free`), which is expensive for pool
+operations like safety invalidation (`resize!(v, 0)`) and acquire-path resizing.
+""" +@inline function _resize_without_shrink!(A::CuVector{T}, n::Integer) where {T} + current = length(A) + if n > current + resize!(A, n) # grow: delegate to CUDA.jl + elseif n < current + setfield!(A, :dims, (Int(n),)) # shrink: dims only, GPU memory preserved + end + return A +end + """ get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} @@ -59,8 +96,9 @@ Returns cached view on hit (near-zero CPU allocation), creates new on miss. - N-way caching eliminates this allocation on cache hit ## Memory Resize Strategy -Backing vectors are resized to match requested size (grow or shrink). -See module header for "lazy shrink" optimization notes. +Backing vectors use `_resize_without_shrink!`: grow delegates to CUDA.jl's +`resize!` (may reallocate), shrink only updates `dims` (GPU memory preserved). +See module header for details. """ @inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} tp.n_active += 1 @@ -110,12 +148,12 @@ See module header for "lazy shrink" optimization notes. @inbounds vec = tp.vectors[idx] current_len = length(vec) if current_len != total_len - # Resize vector to match requested size (grow or shrink) - # Note: CUDA.jl's resize! internally uses 25% threshold - won't reallocate - # unless new size exceeds capacity or is <25% of capacity. - resize!(vec, total_len) - # CRITICAL: resize! may reallocate the GPU buffer (pointer change). - # All cached views for this slot now reference the OLD buffer. + # Resize vector to match requested size (grow or shrink). + # Uses _resize_without_shrink! to avoid GPU reallocation on shrink. + _resize_without_shrink!(vec, total_len) + # CRITICAL: on grow, _resize_without_shrink! delegates to resize! which + # may reallocate the GPU buffer (pointer change). On shrink, pointer is + # stable but length changed. Either way, cached views are stale. # Must invalidate ALL ways to prevent returning stale/dangling views. 
for k in 1:CACHE_WAYS @inbounds tp.views[base + k] = nothing diff --git a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl index 2ac371f8..6dbb6a92 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl @@ -5,13 +5,15 @@ # # Safety levels on CUDA differ from CPU: # - Level 0: Zero overhead (all branches dead-code-eliminated) -# - Level 1: Poisoning (NaN/sentinel fill) + N-way cache invalidation -# (CUDA equivalent of CPU's resize!/setfield! structural invalidation) +# - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via +# _resize_without_shrink!(vec, 0) + N-way cache invalidation # - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays) # - Level 3: Full + borrow call-site registry + debug messages # # Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays. -# On CUDA, resize!(CuVector, 0) frees GPU memory, so we use poisoning instead. +# On CUDA, resize!(CuVector, 0) would free GPU memory, so we use +# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving +# the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink. using AdaptiveArrayPools: _safety_level, _validate_pool_return, _set_pending_callsite!, _maybe_record_borrow!, @@ -57,6 +59,9 @@ end for i in (new_n + 1):old_n_active # Poison released CuVectors with sentinel values _cuda_poison_fill!(@inbounds tp.vectors[i]) + # Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!). + # Matches CPU behavior where resize!(vec, 0) invalidates SubArray references. + _resize_without_shrink!(@inbounds(tp.vectors[i]), 0) # Invalidate N-way cache entries for released slots. # After poisoning, cached views point at poisoned data — clear them so # re-acquire creates fresh views instead of returning stale poisoned ones. 
diff --git a/test/cuda/test_allocation.jl b/test/cuda/test_allocation.jl index b970e25b..7a5ff1b1 100644 --- a/test/cuda/test_allocation.jl +++ b/test/cuda/test_allocation.jl @@ -169,6 +169,58 @@ end +@testset "_resize_without_shrink! GPU memory preservation" begin + _resize_without_shrink! = ext._resize_without_shrink! + + @testset "Shrink preserves GPU pointer" begin + v = CUDA.zeros(Float32, 1000) + ptr = UInt(pointer(v)) + _resize_without_shrink!(v, 100) + @test length(v) == 100 + @test UInt(pointer(v)) == ptr + end + + @testset "Grow-back within maxsize: no realloc" begin + v = CUDA.zeros(Float32, 1000) + ptr = UInt(pointer(v)) + # Shrink first + _resize_without_shrink!(v, 100) + @test length(v) == 100 + @test UInt(pointer(v)) == ptr + # Grow back to original size — maxsize preserved, so no GPU realloc + _resize_without_shrink!(v, 1000) + @test length(v) == 1000 + @test UInt(pointer(v)) == ptr + end + + @testset "Shrink to 0 preserves pointer" begin + v = CUDA.zeros(Float32, 500) + ptr = UInt(pointer(v)) + _resize_without_shrink!(v, 0) + @test length(v) == 0 + # GPU memory still allocated (not freed) + # Grow back from 0 + _resize_without_shrink!(v, 500) + @test length(v) == 500 + @test UInt(pointer(v)) == ptr + end + + @testset "No-op when n == length" begin + v = CUDA.zeros(Float32, 200) + ptr = UInt(pointer(v)) + _resize_without_shrink!(v, 200) + @test length(v) == 200 + @test UInt(pointer(v)) == ptr + end + + @testset "Grow beyond maxsize delegates to resize!" begin + v = CUDA.zeros(Float32, 100) + _resize_without_shrink!(v, 10_000) + @test length(v) == 10_000 + # Pointer may change (new allocation) — just verify length is correct + end +end + @testset "CPU Allocation (CuArray wrapper)" begin @testset "acquire! 
N-D has low CPU allocation (cache hit)" begin diff --git a/test/cuda/test_cuda_safety.jl b/test/cuda/test_cuda_safety.jl index a5fbc5d4..a91e5d47 100644 --- a/test/cuda/test_cuda_safety.jl +++ b/test/cuda/test_cuda_safety.jl @@ -54,8 +54,22 @@ _cuda_test_leak(x) = x end # ============================================================================== - # Level 1: Poisoning (CUDA equivalent of CPU's resize! invalidation) + # Level 1: Poisoning + structural invalidation (length → 0) # ============================================================================== + # CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0) + # Backing vector length becomes 0 (GPU memory preserved via maxsize). + # Poison data persists in GPU memory and is visible on re-acquire (grow-back). + + @testset "Level 1: released vectors have length 0 after rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float32, 100) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # Structural invalidation: length → 0 (matches CPU behavior) + @test length(pool.float32.vectors[1]) == 0 + end @testset "Level 1: Float32 poisoned with NaN on rewind" begin pool = _make_cuda_pool(1) @@ -64,12 +78,10 @@ _cuda_test_leak(x) = x CUDA.fill!(v, 42.0f0) rewind!(pool) - # Backing vector should be poisoned with NaN (NOT freed via resize!) 
- @test length(pool.float32.vectors[1]) >= 10 - cpu_data = Array(pool.float32.vectors[1]) - @test all(isnan, cpu_data[1:10]) + # Backing vector length is 0 after invalidation + @test length(pool.float32.vectors[1]) == 0 - # Re-acquire: should see poisoned data + # Re-acquire: grow-back reuses same GPU memory → poison data visible checkpoint!(pool) v2 = acquire!(pool, Float32, 10) @test all(isnan, Array(v2)) @@ -83,8 +95,11 @@ _cuda_test_leak(x) = x CUDA.fill!(v, Int32(42)) rewind!(pool) - cpu_data = Array(pool.int32.vectors[1]) - @test all(==(typemax(Int32)), cpu_data[1:8]) + # Verify via re-acquire (backing vector length is 0 after invalidation) + checkpoint!(pool) + v2 = acquire!(pool, Int32, 8) + @test all(==(typemax(Int32)), Array(v2)) + rewind!(pool) end @testset "Level 1: ComplexF32 poisoned with NaN on rewind" begin @@ -94,8 +109,10 @@ _cuda_test_leak(x) = x CUDA.fill!(v, ComplexF32(1.0f0 + 2.0f0im)) rewind!(pool) - cpu_data = Array(pool.complexf32.vectors[1]) - @test all(z -> isnan(real(z)) && isnan(imag(z)), cpu_data[1:8]) + checkpoint!(pool) + v2 = acquire!(pool, ComplexF32, 8) + @test all(z -> isnan(real(z)) && isnan(imag(z)), Array(v2)) + rewind!(pool) end @testset "Level 1: Bool poisoned with true on rewind" begin @@ -105,8 +122,10 @@ _cuda_test_leak(x) = x CUDA.fill!(v, false) rewind!(pool) - cpu_data = Array(pool.bool.vectors[1]) - @test all(==(true), cpu_data[1:16]) + checkpoint!(pool) + v2 = acquire!(pool, Bool, 16) + @test all(==(true), Array(v2)) + rewind!(pool) end @testset "Level 1: Float16 poisoned with NaN on rewind" begin @@ -116,8 +135,10 @@ _cuda_test_leak(x) = x CUDA.fill!(v, Float16(42.0)) rewind!(pool) - cpu_data = Array(pool.float16.vectors[1]) - @test all(isnan, cpu_data[1:10]) + checkpoint!(pool) + v2 = acquire!(pool, Float16, 10) + @test all(isnan, Array(v2)) + rewind!(pool) end @testset "Level 1: N-way cache invalidated on poisoned rewind" begin @@ -330,18 +351,26 @@ _cuda_test_leak(x) = x CUDA.fill!(v_inner, 2.0f0) rewind!(pool) - 
# Inner should be poisoned (slot 2 released) - cpu_inner = Array(pool.float32.vectors[2]) - @test all(isnan, cpu_inner[1:20]) + # Inner should be invalidated (slot 2: length → 0, poisoned) + @test length(pool.float32.vectors[2]) == 0 + # Verify poison via re-acquire + checkpoint!(pool) + v_inner2 = acquire!(pool, Float32, 20) + @test all(isnan, Array(v_inner2)) + rewind!(pool) # Outer should still be valid (slot 1 not released) cpu_outer = Array(v_outer) @test all(x -> x == 1.0f0, cpu_outer) rewind!(pool) - # Now outer is also poisoned - cpu_outer_after = Array(pool.float32.vectors[1]) - @test all(isnan, cpu_outer_after[1:10]) + # Now outer is also invalidated (length → 0, poisoned) + @test length(pool.float32.vectors[1]) == 0 + # Verify poison via re-acquire + checkpoint!(pool) + v_outer2 = acquire!(pool, Float32, 10) + @test all(isnan, Array(v_outer2)) + rewind!(pool) end # ============================================================================== @@ -410,9 +439,15 @@ _cuda_test_leak(x) = x CUDA.fill!(v, UInt8(42)) rewind!(pool) + # Backing vector length → 0 after invalidation tp = pool.others[UInt8] - cpu_data = Array(tp.vectors[1]) - @test all(==(typemax(UInt8)), cpu_data[1:16]) + @test length(tp.vectors[1]) == 0 + + # Verify poison via re-acquire + checkpoint!(pool) + v2 = acquire!(pool, UInt8, 16) + @test all(==(typemax(UInt8)), Array(v2)) + rewind!(pool) end # ==============================================================================