Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 57 additions & 19 deletions ext/AdaptiveArrayPoolsCUDAExt/acquire.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,63 @@
# Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable).
#
# ==============================================================================
# Memory Resize Strategy
# Memory Resize Strategy: _resize_without_shrink!
# ==============================================================================
# Current: RESIZE TO FIT - backing vectors grow or shrink to match requested size.
# Same behavior as CPU version.
# GPU vs CPU difference:
# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged, cheap)
# - GPU CuVector: resize!(v, smaller) may reallocate when n < capacity÷4
# (CUDA.jl's 25% threshold triggers pool_alloc + copy + pool_free)
#
# GPU vs CPU difference (verified experimentally):
# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged)
# - GPU CuVector: resize!(v, smaller) may reallocate (CUDA.jl uses 25% threshold)
# However, CUDA memory pool often returns the same block on regrow.
# Problem: Pool operations frequently shrink backing vectors:
# - Safety invalidation: resize!(vec, 0) to invalidate released slots
# - Acquire path: resize!(vec, smaller_size) when reusing a slot for smaller array
# Both trigger expensive GPU reallocation via CUDA.jl's 25% threshold.
#
# TODO: Potential future optimizations:
# - CUDA.jl's resize! already uses 25% threshold internally (no realloc if within capacity)
# - Could use even smaller threshold (e.g., 12.5%) to be more aggressive about shrinking
# - Could track recent N sizes to make smarter decisions (avoid shrink if sizes fluctuate)
# Solution: _resize_without_shrink!(A, n)
# - Grow (n > length): delegates to CUDA.jl resize! (may allocate more GPU memory)
# - Shrink (n < length): setfield!(A, :dims, (n,)) — logical size only, no GPU op
# - Equal (n == length): no-op
#
# Key property: maxsize is preserved on shrink. When later growing back,
# CUDA.jl computes cap = maxsize ÷ aligned_sizeof(T) and sees n ≤ cap,
# so no reallocation occurs. This is ideal for pool's borrow/return pattern.
#
# ⚠ Depends on CuArray internal fields (:dims, :maxsize). Tested with CUDA.jl v5.x.
# ==============================================================================

using AdaptiveArrayPools: get_view!, get_array!, allocate_vector, safe_prod,
_record_type_touch!, _fixed_slot_bit, _checkpoint_typed_pool!,
_MODE_BITS_MASK

# Guard against CUDA.jl internal API changes (tested with v5.x).
# setfield!(:dims) requires CuArray to be a mutable struct with a :dims field.
# NOTE: the correct mutability predicate for a *type* is `ismutabletype`.
# The previous `ismutable(CuArray)` queried the mutability of the type object
# itself (a DataType/UnionAll, which is always a mutable heap object), so the
# guard could never fire even if CuArray became an immutable struct.
# `ismutabletype` and `hasfield` both unwrap the UnionAll before checking.
@static if !(ismutabletype(CuArray) && hasfield(CuArray, :dims))
    error("Unsupported CUDA.jl version: expected mutable CuArray with field :dims. _resize_without_shrink! needs updating.")
end

"""
    _resize_without_shrink!(A::CuVector{T}, n::Integer) -> CuVector{T}

Adjust the logical length of a CuVector without releasing GPU memory on shrink.

- `n > length(A)`: falls through to `resize!(A, n)` (CUDA.jl may grow the allocation)
- `n == length(A)`: nothing to do
- `n < length(A)`: rewrites only the `dims` field; the backing buffer stays at `maxsize`

This sidesteps CUDA.jl's shrink-reallocation heuristic (`n < cap÷4` triggers
`pool_alloc` + `unsafe_copyto!` + `pool_free`), which is costly for pool
operations such as safety invalidation (`resize!(v, 0)`) and acquire-path resizing.
"""
@inline function _resize_without_shrink!(A::CuVector{T}, n::Integer) where {T}
    target = Int(n)
    len = length(A)
    if target < len
        # Shrink: update the logical size only — no GPU operation, buffer preserved.
        setfield!(A, :dims, (target,))
    elseif target > len
        # Grow: delegate to CUDA.jl; reuses the buffer while target fits in maxsize.
        resize!(A, target)
    end
    return A
end

"""
get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T}

Expand Down Expand Up @@ -59,8 +96,9 @@ Returns cached view on hit (near-zero CPU allocation), creates new on miss.
- N-way caching eliminates this allocation on cache hit

## Memory Resize Strategy
Backing vectors are resized to match requested size (grow or shrink).
See module header for "lazy shrink" optimization notes.
Backing vectors use `_resize_without_shrink!`: grow delegates to CUDA.jl's
`resize!` (may reallocate), shrink only updates `dims` (GPU memory preserved).
See module header for details.
"""
@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N}
tp.n_active += 1
Expand Down Expand Up @@ -110,12 +148,12 @@ See module header for "lazy shrink" optimization notes.
@inbounds vec = tp.vectors[idx]
current_len = length(vec)
if current_len != total_len
# Resize vector to match requested size (grow or shrink)
# Note: CUDA.jl's resize! internally uses 25% threshold - won't reallocate
# unless new size exceeds capacity or is <25% of capacity.
resize!(vec, total_len)
# CRITICAL: resize! may reallocate the GPU buffer (pointer change).
# All cached views for this slot now reference the OLD buffer.
# Resize vector to match requested size (grow or shrink).
# Uses _resize_without_shrink! to avoid GPU reallocation on shrink.
_resize_without_shrink!(vec, total_len)
# CRITICAL: on grow, _resize_without_shrink! delegates to resize! which
# may reallocate the GPU buffer (pointer change). On shrink, pointer is
# stable but length changed. Either way, cached views are stale.
# Must invalidate ALL ways to prevent returning stale/dangling views.
for k in 1:CACHE_WAYS
@inbounds tp.views[base + k] = nothing
Expand Down
11 changes: 8 additions & 3 deletions ext/AdaptiveArrayPoolsCUDAExt/debug.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
#
# Safety levels on CUDA differ from CPU:
# - Level 0: Zero overhead (all branches dead-code-eliminated)
# - Level 1: Poisoning (NaN/sentinel fill) + N-way cache invalidation
# (CUDA equivalent of CPU's resize!/setfield! structural invalidation)
# - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via
# _resize_without_shrink!(vec, 0) + N-way cache invalidation
# - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays)
# - Level 3: Full + borrow call-site registry + debug messages
#
# Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays.
# On CUDA, resize!(CuVector, 0) frees GPU memory, so we use poisoning instead.
# On CUDA, resize!(CuVector, 0) would free GPU memory, so we use
# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving
# the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink.

using AdaptiveArrayPools: _safety_level, _validate_pool_return,
_set_pending_callsite!, _maybe_record_borrow!,
Expand Down Expand Up @@ -57,6 +59,9 @@ end
for i in (new_n + 1):old_n_active
# Poison released CuVectors with sentinel values
_cuda_poison_fill!(@inbounds tp.vectors[i])
# Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!).
# Matches CPU behavior where resize!(vec, 0) invalidates SubArray references.
_resize_without_shrink!(@inbounds(tp.vectors[i]), 0)
# Invalidate N-way cache entries for released slots.
# After poisoning, cached views point at poisoned data — clear them so
# re-acquire creates fresh views instead of returning stale poisoned ones.
Expand Down
52 changes: 52 additions & 0 deletions test/cuda/test_allocation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,58 @@

end

@testset "_resize_without_shrink! GPU memory preservation" begin
    # Local alias for the extension's internal helper under test.
    _resize_without_shrink! = ext._resize_without_shrink!

    @testset "Shrink preserves GPU pointer" begin
        buf = CUDA.zeros(Float32, 1000)
        addr = UInt(pointer(buf))
        _resize_without_shrink!(buf, 100)
        @test length(buf) == 100
        @test UInt(pointer(buf)) == addr
    end

    @testset "Grow-back within maxsize: no realloc" begin
        buf = CUDA.zeros(Float32, 1000)
        addr = UInt(pointer(buf))
        # Shrink first — logical size only, buffer retained.
        _resize_without_shrink!(buf, 100)
        @test length(buf) == 100
        @test UInt(pointer(buf)) == addr
        # Grow back to the original size — maxsize preserved, so no GPU realloc.
        _resize_without_shrink!(buf, 1000)
        @test length(buf) == 1000
        @test UInt(pointer(buf)) == addr
    end

    @testset "Shrink to 0 preserves pointer" begin
        buf = CUDA.zeros(Float32, 500)
        addr = UInt(pointer(buf))
        _resize_without_shrink!(buf, 0)
        @test length(buf) == 0
        # GPU memory still allocated (not freed); grow back from 0.
        _resize_without_shrink!(buf, 500)
        @test length(buf) == 500
        @test UInt(pointer(buf)) == addr
    end

    @testset "No-op when n == length" begin
        buf = CUDA.zeros(Float32, 200)
        addr = UInt(pointer(buf))
        _resize_without_shrink!(buf, 200)
        @test length(buf) == 200
        @test UInt(pointer(buf)) == addr
    end

    @testset "Grow beyond maxsize delegates to resize!" begin
        buf = CUDA.zeros(Float32, 100)
        _resize_without_shrink!(buf, 10_000)
        @test length(buf) == 10_000
        # Pointer may change (new allocation) — only the length is asserted.
    end
end

@testset "CPU Allocation (CuArray wrapper)" begin

@testset "acquire! N-D has low CPU allocation (cache hit)" begin
Expand Down
79 changes: 57 additions & 22 deletions test/cuda/test_cuda_safety.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,22 @@ _cuda_test_leak(x) = x
end

# ==============================================================================
# Level 1: Poisoning (CUDA equivalent of CPU's resize! invalidation)
# Level 1: Poisoning + structural invalidation (length → 0)
# ==============================================================================
# CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0)
# Backing vector length becomes 0 (GPU memory preserved via maxsize).
# Poison data persists in GPU memory and is visible on re-acquire (grow-back).

# Level 1 safety on CUDA: after rewind!, a released slot's backing vector is
# structurally invalidated by shrinking its logical length to 0. Per the
# _resize_without_shrink! contract the GPU allocation itself is presumably
# retained (maxsize preserved) — the test below asserts only the length.
@testset "Level 1: released vectors have length 0 after rewind" begin
pool = _make_cuda_pool(1)
checkpoint!(pool)
v = acquire!(pool, Float32, 100)
# Fill with a recognizable value so later poison/invalidation is observable.
CUDA.fill!(v, 42.0f0)
rewind!(pool)

# Structural invalidation: length → 0 (matches CPU behavior)
@test length(pool.float32.vectors[1]) == 0
end

@testset "Level 1: Float32 poisoned with NaN on rewind" begin
pool = _make_cuda_pool(1)
Expand All @@ -64,12 +78,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, 42.0f0)
rewind!(pool)

# Backing vector should be poisoned with NaN (NOT freed via resize!)
@test length(pool.float32.vectors[1]) >= 10
cpu_data = Array(pool.float32.vectors[1])
@test all(isnan, cpu_data[1:10])
# Backing vector length is 0 after invalidation
@test length(pool.float32.vectors[1]) == 0

# Re-acquire: should see poisoned data
# Re-acquire: grow-back reuses same GPU memory → poison data visible
checkpoint!(pool)
v2 = acquire!(pool, Float32, 10)
@test all(isnan, Array(v2))
Expand All @@ -83,8 +95,11 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, Int32(42))
rewind!(pool)

cpu_data = Array(pool.int32.vectors[1])
@test all(==(typemax(Int32)), cpu_data[1:8])
# Verify via re-acquire (backing vector length is 0 after invalidation)
checkpoint!(pool)
v2 = acquire!(pool, Int32, 8)
@test all(==(typemax(Int32)), Array(v2))
rewind!(pool)
end

@testset "Level 1: ComplexF32 poisoned with NaN on rewind" begin
Expand All @@ -94,8 +109,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, ComplexF32(1.0f0 + 2.0f0im))
rewind!(pool)

cpu_data = Array(pool.complexf32.vectors[1])
@test all(z -> isnan(real(z)) && isnan(imag(z)), cpu_data[1:8])
checkpoint!(pool)
v2 = acquire!(pool, ComplexF32, 8)
@test all(z -> isnan(real(z)) && isnan(imag(z)), Array(v2))
rewind!(pool)
end

@testset "Level 1: Bool poisoned with true on rewind" begin
Expand All @@ -105,8 +122,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, false)
rewind!(pool)

cpu_data = Array(pool.bool.vectors[1])
@test all(==(true), cpu_data[1:16])
checkpoint!(pool)
v2 = acquire!(pool, Bool, 16)
@test all(==(true), Array(v2))
rewind!(pool)
end

@testset "Level 1: Float16 poisoned with NaN on rewind" begin
Expand All @@ -116,8 +135,10 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, Float16(42.0))
rewind!(pool)

cpu_data = Array(pool.float16.vectors[1])
@test all(isnan, cpu_data[1:10])
checkpoint!(pool)
v2 = acquire!(pool, Float16, 10)
@test all(isnan, Array(v2))
rewind!(pool)
end

@testset "Level 1: N-way cache invalidated on poisoned rewind" begin
Expand Down Expand Up @@ -330,18 +351,26 @@ _cuda_test_leak(x) = x
CUDA.fill!(v_inner, 2.0f0)
rewind!(pool)

# Inner should be poisoned (slot 2 released)
cpu_inner = Array(pool.float32.vectors[2])
@test all(isnan, cpu_inner[1:20])
# Inner should be invalidated (slot 2: length → 0, poisoned)
@test length(pool.float32.vectors[2]) == 0
# Verify poison via re-acquire
checkpoint!(pool)
v_inner2 = acquire!(pool, Float32, 20)
@test all(isnan, Array(v_inner2))
rewind!(pool)

# Outer should still be valid (slot 1 not released)
cpu_outer = Array(v_outer)
@test all(x -> x == 1.0f0, cpu_outer)

rewind!(pool)
# Now outer is also poisoned
cpu_outer_after = Array(pool.float32.vectors[1])
@test all(isnan, cpu_outer_after[1:10])
# Now outer is also invalidated (length → 0, poisoned)
@test length(pool.float32.vectors[1]) == 0
# Verify poison via re-acquire
checkpoint!(pool)
v_outer2 = acquire!(pool, Float32, 10)
@test all(isnan, Array(v_outer2))
rewind!(pool)
end

# ==============================================================================
Expand Down Expand Up @@ -410,9 +439,15 @@ _cuda_test_leak(x) = x
CUDA.fill!(v, UInt8(42))
rewind!(pool)

# Backing vector length → 0 after invalidation
tp = pool.others[UInt8]
cpu_data = Array(tp.vectors[1])
@test all(==(typemax(UInt8)), cpu_data[1:16])
@test length(tp.vectors[1]) == 0

# Verify poison via re-acquire
checkpoint!(pool)
v2 = acquire!(pool, UInt8, 16)
@test all(==(typemax(UInt8)), Array(v2))
rewind!(pool)
end

# ==============================================================================
Expand Down
Loading