diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index 8abb3672..03c2ecf7 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -10,19 +10,6 @@ module AdaptiveArrayPoolsCUDAExt using AdaptiveArrayPools using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool -using Preferences: @load_preference, @set_preferences! - -# N-way view cache configuration (CUDA only — CPU ≥1.11 uses slot-first _claim_slot!). -# GPU view/reshape allocates ~80 bytes on CPU heap, so caching still matters. -const CACHE_WAYS = let - ways = @load_preference("cache_ways", 4)::Int - if ways < 1 || ways > 16 - @warn "CACHE_WAYS=$ways out of range [1,16], using default 4" - 4 - else - ways - end -end using CUDA # Type definitions @@ -31,7 +18,7 @@ include("types.jl") # Dispatch methods (allocate_vector, wrap_array, get_typed_pool!) include("dispatch.jl") -# GPU-specific get_view! implementation +# GPU-specific acquire (arr_wrappers + setfield!, _resize_to_fit!, _reshape_impl!) include("acquire.jl") # Task-local pool (multi-device aware) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index 21a8075a..a0a77fb1 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -1,192 +1,315 @@ # ============================================================================== -# CUDA-Specific Unified get_view! Implementation (N-Way Cache) +# CUDA-Specific Acquire Implementation (arr_wrappers + setfield!) # ============================================================================== -# Unlike CPU, GPU views (view(CuVector, 1:n)) return CuVector via GPUArrays derive(), -# NOT SubArray. Similarly, reshape() returns CuArray, not ReshapedArray. -# This allows a single unified implementation for all dimensions. 
+# Mirrors CPU's Julia 1.11+ approach: cached CuArray{T,N} wrappers reused via +# setfield!(:dims) for zero-allocation on cache hit. # -# N-way cache layout (flat vector): -# views[(slot-1)*CACHE_WAYS + way] for way ∈ 1:CACHE_WAYS -# -# Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable). +# Key differences from CPU: +# - CPU Array has :ref (MemoryRef, GC-managed, no refcount) +# - CuArray has :data (DataRef, manual refcount via Threads.Atomic{Int}) +# - We minimize refcount overhead via wrapper.data.rc !== vec.data.rc identity +# check (~2ns). Only update :data when GPU buffer actually changed (rare). # # ============================================================================== -# Memory Resize Strategy: _resize_without_shrink! +# Memory Resize Strategy: _resize_to_fit! # ============================================================================== -# GPU vs CPU difference: -# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged, cheap) -# - GPU CuVector: resize!(v, smaller) may reallocate when n < capacity÷4 -# (CUDA.jl's 25% threshold triggers pool_alloc + copy + pool_free) -# -# Problem: Pool operations frequently shrink backing vectors: -# - Safety invalidation: resize!(vec, 0) to invalidate released slots -# - Acquire path: resize!(vec, smaller_size) when reusing a slot for smaller array -# Both trigger expensive GPU reallocation via CUDA.jl's 25% threshold. +# CUDA.jl resize! behavior varies by version: +# - v5.9.x: ALWAYS reallocates (no capacity management) +# - v5.10.x+: capacity check — reallocates only when n > cap or n < cap÷4 # -# Solution: _resize_without_shrink!(A, n) -# - Grow (n > length): delegates to CUDA.jl resize! 
(may allocate more GPU memory) -# - Shrink (n < length): setfield!(A, :dims, (n,)) — logical size only, no GPU op -# - Equal (n == length): no-op +# _resize_to_fit!(A, n): +# - n within capacity (maxsize): setfield!(:dims) only — no GPU operation +# - n beyond capacity: delegates to CUDA.jl resize! (GPU alloc) +# - n == length(A): no-op # -# Key property: maxsize is preserved on shrink. When later growing back, -# CUDA.jl computes cap = maxsize ÷ aligned_sizeof(T) and sees n ≤ cap, -# so no reallocation occurs. This is ideal for pool's borrow/return pattern. +# This is strictly better than _resize_without_shrink! (which only optimized +# shrink). _resize_to_fit! also optimizes grow-within-capacity, critical for +# re-acquire after safety invalidation (dims=(0,), maxsize preserved). # -# ⚠ Depends on CuArray internal fields (:dims, .maxsize). Tested with CUDA.jl v5.x. +# ⚠ Depends on CuArray internal fields (:data, :dims, :maxsize, :offset). +# Tested with CUDA.jl v5.x. # ============================================================================== using AdaptiveArrayPools: get_view!, get_array!, allocate_vector, safe_prod, _record_type_touch!, _fixed_slot_bit, _checkpoint_typed_pool!, + _store_arr_wrapper!, _check_pool_growth, _reshape_impl!, + _acquire_impl!, _unsafe_acquire_impl!, _maybe_record_borrow!, _MODE_BITS_MASK +using CUDA.GPUArrays: unsafe_free! + # Guard against CUDA.jl internal API changes (tested with v5.x). -# setfield!(:dims) requires CuArray to be mutable and have a :dims field. -@static if !(ismutabletype(CuArray) && hasfield(CuArray, :dims)) - error("Unsupported CUDA.jl version: expected mutable CuArray with field :dims. _resize_without_shrink! 
needs updating.") +@static if !( + ismutabletype(CuArray) && hasfield(CuArray, :dims) && + hasfield(CuArray, :data) && hasfield(CuArray, :maxsize) && + hasfield(CuArray, :offset) + ) + error("Unsupported CUDA.jl version: CuArray must be mutable with :data, :dims, :maxsize, :offset fields.") end +# ============================================================================== +# Aligned sizeof (mirrors CUDA.jl internal) +# ============================================================================== + +"""Compute aligned element size, matching CUDA.jl's internal `aligned_sizeof`.""" +_aligned_sizeof(::Type{T}) where {T} = max(sizeof(T), Base.datatype_alignment(T)) + +# ============================================================================== +# _resize_to_fit! — Capacity-Aware Resize (superset of _resize_without_shrink!) +# ============================================================================== + """ - _resize_without_shrink!(A::CuVector{T}, n::Integer) -> CuVector{T} + _resize_to_fit!(A::CuVector{T}, n::Integer) -> CuVector{T} -Resize a CuVector's logical length without freeing GPU memory on shrink. +Resize a CuVector's logical length, using `setfield!(:dims)` when within capacity. -- `n > length(A)`: delegates to `resize!(A, n)` (may grow GPU allocation) +- `n > capacity`: delegates to `resize!(A, n)` (may grow GPU allocation) +- `n ≤ capacity, n ≠ length(A)`: `setfield!(:dims)` only — no GPU operation - `n == length(A)`: no-op -- `n < length(A)`: only updates `dims` field (GPU memory preserved at `maxsize`) - -Avoids CUDA.jl's 25% threshold reallocation on shrink (`n < cap÷4` triggers -`pool_alloc` + `unsafe_copyto!` + `pool_free`), which is expensive for pool -operations like safety invalidation (`resize!(v, 0)`) and acquire-path resizing. 
-""" -@inline function _resize_without_shrink!(A::CuVector{T}, n::Integer) where {T} - current = length(A) - if n > current - resize!(A, n) # grow: delegate to CUDA.jl - elseif n < current - setfield!(A, :dims, (Int(n),)) # shrink: dims only, GPU memory preserved + +Capacity = `A.maxsize ÷ aligned_sizeof(T)`. Since `setfield!(:dims)` preserves +`maxsize`, capacity information is naturally retained across shrink/grow cycles. +""" +@inline function _resize_to_fit!(A::CuVector{T}, n::Integer) where {T} + cap = A.maxsize ÷ _aligned_sizeof(T) + if n > cap + resize!(A, n) # Beyond capacity: delegate to CUDA.jl + elseif n != length(A) + setfield!(A, :dims, (Int(n),)) # Within capacity: dims only end return A end +# ============================================================================== +# _cuda_claim_slot! — Capacity-Based Slot Claim +# ============================================================================== + """ - get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} + _cuda_claim_slot!(tp::CuTypedPool{T}, total_len::Int) -> Int -1D convenience wrapper - delegates to tuple version. -`(n,)` is stack-allocated (isbits NTuple), so this is zero-allocation when inlined. +Claim the next slot, ensuring the backing vector's GPU buffer has capacity ≥ `total_len`. +Uses maxsize-based capacity check instead of length check to avoid triggering +CUDA.jl's resize! unnecessarily (especially after safety invalidation sets dims=(0,)). """ -@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} - return get_view!(tp, (n,)) +@inline function _cuda_claim_slot!(tp::CuTypedPool{T}, total_len::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, total_len)) + _check_pool_growth(tp, idx) + else + # _resize_to_fit! handles all cases: + # - n > capacity: resize! 
(GPU alloc) + # - n != length: setfield!(:dims) — restores length after safety invalidation + # - n == length: no-op (hot path) + _resize_to_fit!(@inbounds(tp.vectors[idx]), total_len) + end + return idx end """ - get_view!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} + _cuda_claim_slot!(tp::CuTypedPool{T}) -> Int -Get an N-dimensional view from the pool with unified N-way caching. -Returns cached view on hit (near-zero CPU allocation), creates new on miss. +Claim the next slot without provisioning memory (zero-length backing vector). +Used by `_reshape_impl!` which only needs the slot index for wrapper caching — +the wrapper points to a different array's memory via `setfield!(:data)`. +""" +@inline function _cuda_claim_slot!(tp::CuTypedPool{T}) where {T} + tp.n_active += 1 + idx = tp.n_active + if idx > length(tp.vectors) + push!(tp.vectors, CuVector{T}(undef, 0)) + _check_pool_growth(tp, idx) + end + return idx +end -## N-Way Cache Behavior -- Each slot has CACHE_WAYS (4) cache entries for different dimension patterns -- Cache lookup uses simple for loop (~16 bytes overhead) -- Cache replacement uses round-robin when all ways are occupied +# ============================================================================== +# _update_cuda_wrapper_data! — DataRef Refcount Management +# ============================================================================== -## GPU-Specific Behavior -- GPU `view()` returns `CuVector` (not SubArray) -- GPU `reshape()` returns `CuArray{T,N}` (not ReshapedArray) -- Both allocate ~80 bytes on CPU heap for the wrapper object -- N-way caching eliminates this allocation on cache hit +""" + _update_cuda_wrapper_data!(cu::CuArray, source::CuArray) -## Memory Resize Strategy -Backing vectors use `_resize_without_shrink!`: grow delegates to CUDA.jl's -`resize!` (may reallocate), shrink only updates `dims` (GPU memory preserved). -See module header for details. 
+Re-point wrapper `cu` at `source`'s GPU buffer: release the old DataRef, take a new +reference via `copy(source.data)`, and mirror `source`'s `maxsize` and `offset`. +@noinline: refcount traffic is kept off the inlined acquire hot path. """ -@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} - tp.n_active += 1 - idx = tp.n_active - total_len = safe_prod(dims) +@noinline function _update_cuda_wrapper_data!(cu::CuArray, source::CuArray) + unsafe_free!(cu.data) + setfield!(cu, :data, copy(source.data)) + setfield!(cu, :maxsize, source.maxsize) + setfield!(cu, :offset, source.offset) # source may be an offset view (reshape path) + return nothing +end - # 1. Expand pool if needed (new slot) - if idx > length(tp.vectors) - push!(tp.vectors, allocate_vector(tp, total_len)) - @inbounds vec = tp.vectors[idx] - new_view = view(vec, 1:total_len) - nd_view = N == 1 ? new_view : reshape(new_view, dims) - - # Initialize N-way cache entries for this slot - for _ in 1:CACHE_WAYS - push!(tp.views, nothing) - push!(tp.view_dims, nothing) - end - push!(tp.next_way, 1) +# ============================================================================== +# _acquire_impl! / _unsafe_acquire_impl! — Direct get_array! Dispatch +# ============================================================================== +# On CUDA, both acquire! and unsafe_acquire! go through get_array! directly. +# No view/array distinction — CuArray is always returned. +# This eliminates the get_view! → get_array! indirection that CPU still uses +# for the acquire! (view) path. - # Store in first way - base = (idx - 1) * CACHE_WAYS - @inbounds tp.views[base + 1] = nd_view - @inbounds tp.view_dims[base + 1] = dims +""" + _acquire_impl!(pool::CuAdaptiveArrayPool, T, n) -> CuArray{T,1} + _acquire_impl!(pool::CuAdaptiveArrayPool, T, dims...) -> CuArray{T,N} - # Warn at powers of 2 (512, 1024, 2048, ...)
- possible missing rewind!() - if idx >= 512 && (idx & (idx - 1)) == 0 - total_bytes = sum(length, tp.vectors) * sizeof(T) - @warn "CuTypedPool{$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" - end +CUDA override: routes directly to `get_array!` (no view indirection). +""" +@inline function AdaptiveArrayPools._acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, n::Int) where {T} + tp = get_typed_pool!(pool, T) + result = get_array!(tp, (n,)) + _maybe_record_borrow!(pool, tp) + return result +end - return nd_view - end +@inline function AdaptiveArrayPools._acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + result = get_array!(tp, dims) + _maybe_record_borrow!(pool, tp) + return result +end - # 2. N-way cache lookup with for loop - base = (idx - 1) * CACHE_WAYS - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_dims = tp.view_dims[cache_idx] - if cached_dims isa NTuple{N, Int} && cached_dims == dims - # Cache hit - return cached view - return @inbounds tp.views[cache_idx]::CuArray{T, N} - end - end +@inline function AdaptiveArrayPools._acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + return _acquire_impl!(pool, T, dims...) +end - # 3. Cache miss: create new view, use round-robin replacement - @inbounds vec = tp.vectors[idx] - current_len = length(vec) - if current_len != total_len - # Resize vector to match requested size (grow or shrink). - # Uses _resize_without_shrink! to avoid GPU reallocation on shrink. - _resize_without_shrink!(vec, total_len) - # CRITICAL: on grow, _resize_without_shrink! delegates to resize! which - # may reallocate the GPU buffer (pointer change). On shrink, pointer is - # stable but length changed. Either way, cached views are stale. - # Must invalidate ALL ways to prevent returning stale/dangling views. 
- for k in 1:CACHE_WAYS - @inbounds tp.views[base + k] = nothing - @inbounds tp.view_dims[base + k] = nothing - end - @inbounds tp.next_way[idx] = 1 # Reset round-robin - end +""" + _unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, T, dims...) -> CuArray{T,N} - new_view = view(vec, 1:total_len) - nd_view = N == 1 ? new_view : reshape(new_view, dims) +CUDA override: same as `_acquire_impl!` — both return CuArray via `get_array!`. +""" +@inline function AdaptiveArrayPools._unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, n::Int) where {T} + tp = get_typed_pool!(pool, T) + result = get_array!(tp, (n,)) + _maybe_record_borrow!(pool, tp) + return result +end - # Round-robin replacement (or first way if just flushed) - @inbounds way = tp.next_way[idx] - cache_idx = base + way - @inbounds tp.views[cache_idx] = nd_view - @inbounds tp.view_dims[cache_idx] = dims - @inbounds tp.next_way[idx] = (way % CACHE_WAYS) + 1 +@inline function AdaptiveArrayPools._unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + result = get_array!(tp, dims) + _maybe_record_borrow!(pool, tp) + return result +end - return nd_view +@inline function AdaptiveArrayPools._unsafe_acquire_impl!(pool::CuAdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + return _unsafe_acquire_impl!(pool, T, dims...) end # ============================================================================== -# CUDA-Specific get_array! - Delegates to unified get_view! +# get_view! / get_array! — arr_wrappers + setfield! Based Zero-Alloc # ============================================================================== +# get_view! delegates to get_array! for backward compat (e.g., direct get_view! calls). +# The main acquire path now bypasses get_view! entirely via _acquire_impl! above. 
+ +@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} + return get_array!(tp, (n,)) +end + +@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + return get_array!(tp, dims) +end """ get_array!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} -Delegates to `get_view!(tp, dims)` for unified caching. -Used by `unsafe_acquire!` - same zero-allocation behavior as `acquire!`. +Get an N-dimensional `CuArray` from the pool with `setfield!`-based wrapper reuse. + +## Cache Hit (common case, 0-alloc) +1. Look up `arr_wrappers[N][slot]` +2. Check `wrapper.data.rc !== vec.data.rc` — if same GPU buffer, just `setfield!(:dims)` +3. If different (rare: only after grow beyond capacity), update `:data` via refcount management + +## Cache Miss (first call per (slot, N)) +Creates CuArray wrapper sharing backing vector's GPU memory via `copy(vec.data)`, +stores in `arr_wrappers[N][slot]` via `_store_arr_wrapper!` (reuses base module helper). """ @inline function AdaptiveArrayPools.get_array!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} - return get_view!(tp, dims) + total_len = safe_prod(dims) + slot = _cuda_claim_slot!(tp, total_len) + @inbounds vec = tp.vectors[slot] + + # arr_wrappers lookup (direct index, no hash — same as CPU path) + wrappers = N <= length(tp.arr_wrappers) ? 
(@inbounds tp.arr_wrappers[N]) : nothing + if wrappers !== nothing && slot <= length(wrappers) + wrapper = @inbounds wrappers[slot] + if wrapper !== nothing + cu = wrapper::CuArray{T, N} + # Check if backing vec's GPU buffer changed (rare: only on grow beyond capacity) + if cu.data.rc !== vec.data.rc + _update_cuda_wrapper_data!(cu, vec) + end + setfield!(cu, :dims, dims) + return cu + end + end + + # Cache miss: create wrapper sharing vec's GPU memory + cu = CuArray{T, N}(copy(vec.data), dims; maxsize = vec.maxsize, offset = 0) + _store_arr_wrapper!(tp, N, slot, cu) + return cu +end + +# ============================================================================== +# _reshape_impl! for CuArray — Zero-Alloc Reshape +# ============================================================================== + +""" + _reshape_impl!(pool::CuAdaptiveArrayPool, A::CuArray{T,M}, dims::NTuple{N,Int}) -> CuArray{T,N} + +Zero-allocation reshape for CuArray using `setfield!`-based wrapper reuse. + +- **Same dimensionality (M == N)**: `setfield!(A, :dims, dims)` — no pool interaction +- **Different dimensionality (M ≠ N)**: Claims a pool slot, reuses cached `CuArray{T,N}` + wrapper re-pointed at `A`'s GPU memory (`:data`, `:maxsize`, `:offset`, `:dims`).
+""" +@inline function AdaptiveArrayPools._reshape_impl!( + pool::CuAdaptiveArrayPool, A::CuArray{T, M}, dims::NTuple{N, Int} + ) where {T, M, N} + for d in dims + d < 0 && throw(ArgumentError("invalid CuArray dimensions")) + end + total_len = safe_prod(dims) + length(A) == total_len || throw( + DimensionMismatch( + "new dimensions $(dims) must be consistent with array length $(length(A))" + ) + ) + + # 0-D reshape: rare edge case, delegate to Base (arr_wrappers is 1-indexed by N) + N == 0 && return reshape(A, dims) + + # Same dimensionality: just update dims in-place, no pool interaction + if M == N + setfield!(A, :dims, dims) + return A + end + + # Different dimensionality: claim slot + reuse cached N-D wrapper + tp = AdaptiveArrayPools.get_typed_pool!(pool, T) + _record_type_touch!(pool, T) + slot = _cuda_claim_slot!(tp) + + # Look up cached wrapper (direct index, no hash) + wrappers = N <= length(tp.arr_wrappers) ? (@inbounds tp.arr_wrappers[N]) : nothing + if wrappers !== nothing && slot <= length(wrappers) + wrapper = @inbounds wrappers[slot] + if wrapper !== nothing + cu = wrapper::CuArray{T, N} + cu.data.rc !== A.data.rc && _update_cuda_wrapper_data!(cu, A) # new buffer: swap DataRef + setfield!(cu, :maxsize, A.maxsize) # same buffer can still be a different view, + setfield!(cu, :offset, A.offset) # so always mirror A (matches the cache-miss path) + setfield!(cu, :dims, dims) + return cu + end + end + + # Cache miss (first call per slot+N): create wrapper, cache forever + cu = CuArray{T, N}(copy(A.data), dims; maxsize = A.maxsize, offset = A.offset) + _store_arr_wrapper!(tp, N, slot, cu) + return cu end # ============================================================================== diff --git a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl index 6dbb6a92..5c6c0405 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl @@ -6,18 +6,19 @@ # Safety levels on CUDA differ from CPU: # - Level 0: Zero overhead (all branches dead-code-eliminated) # - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via -# _resize_without_shrink!(vec, 0) + N-way cache
invalidation +# _resize_to_fit!(vec, 0) + arr_wrappers invalidation (setfield!(:dims, zeros)) # - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays) # - Level 3: Full + borrow call-site registry + debug messages # # Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays. # On CUDA, resize!(CuVector, 0) would free GPU memory, so we use -# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving +# _resize_to_fit!(vec, 0) instead — sets dims to (0,) while preserving # the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink. +# arr_wrappers are invalidated by setting wrapper dims to zeros (matches CPU pattern). using AdaptiveArrayPools: _safety_level, _validate_pool_return, _set_pending_callsite!, _maybe_record_borrow!, - _invalidate_released_slots!, + _invalidate_released_slots!, _zero_dims_tuple, _throw_pool_escape_error, POOL_DEBUG, POOL_SAFETY_LV, PoolRuntimeEscapeError @@ -49,26 +50,29 @@ end # # Overrides the no-op fallback in base. On CUDA: # - Level 0: no-op (base _rewind_typed_pool! gates with S >= 1, so never called) -# - Level 1+: poison released CuVectors + invalidate N-way view cache -# - NO resize!(cuv, 0) — would free GPU memory +# - Level 1+: poison released CuVectors + invalidate arr_wrappers +# - NO resize!(cuv, 0) — would free GPU memory; use _resize_to_fit! instead @noinline function AdaptiveArrayPools._invalidate_released_slots!( tp::CuTypedPool{T}, old_n_active::Int, S::Int ) where {T} new_n = tp.n_active + # Poison released CuVectors + shrink logical length to 0 for i in (new_n + 1):old_n_active - # Poison released CuVectors with sentinel values _cuda_poison_fill!(@inbounds tp.vectors[i]) - # Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!). + # Shrink logical length to 0 (GPU memory preserved via _resize_to_fit!). # Matches CPU behavior where resize!(vec, 0) invalidates SubArray references. 
- _resize_without_shrink!(@inbounds(tp.vectors[i]), 0) - # Invalidate N-way cache entries for released slots. - # After poisoning, cached views point at poisoned data — clear them so - # re-acquire creates fresh views instead of returning stale poisoned ones. - base = (i - 1) * CACHE_WAYS - for k in 1:CACHE_WAYS - @inbounds tp.views[base + k] = nothing - @inbounds tp.view_dims[base + k] = nothing + _resize_to_fit!(@inbounds(tp.vectors[i]), 0) + end + # Invalidate arr_wrappers for released slots (matches CPU pattern from src/state.jl) + for N_idx in 1:length(tp.arr_wrappers) + wrappers_for_N = @inbounds tp.arr_wrappers[N_idx] + wrappers_for_N === nothing && continue + wrappers = wrappers_for_N::Vector{Any} + for i in (new_n + 1):min(old_n_active, length(wrappers)) + wrapper = @inbounds wrappers[i] + wrapper === nothing && continue + setfield!(wrapper::CuArray, :dims, _zero_dims_tuple(N_idx)) end end return nothing diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl index 80a28ee2..be5b5dae 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/state.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl @@ -316,9 +316,7 @@ CUDA.reclaim() """ function Base.empty!(tp::CuTypedPool) empty!(tp.vectors) - empty!(tp.views) - empty!(tp.view_dims) - empty!(tp.next_way) + empty!(tp.arr_wrappers) tp.n_active = 0 # Restore sentinel values empty!(tp._checkpoint_n_active) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index 90f40741..b4f03b6c 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -3,44 +3,38 @@ # ============================================================================== # Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()), -# NOT SubArray. However, we still cache view objects to avoid CPU heap allocation -# (~80 bytes per call) for the CuVector metadata wrapper. 
- -# Note: Uses shared CACHE_WAYS constant from main module for consistency. +# NOT SubArray. GPU view/reshape creation allocates ~80 bytes on CPU heap for the +# CuArray wrapper. We cache wrappers via arr_wrappers to achieve zero-allocation +# on cache hit (same approach as CPU's setfield!-based Array wrapper reuse). """ CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} -GPU memory pool for element type `T`. Uses unified N-way view caching for all dimensions. +GPU memory pool for element type `T`. Uses `arr_wrappers`-based CuArray reuse +for zero-allocation acquire (same design as CPU TypedPool on Julia 1.11+). ## Fields - `vectors`: Backing `CuVector{T}` storage (one per slot) -- `views`: Flat N-way cache storing CuArray of any dimension - - Layout: `views[(slot-1)*CACHE_WAYS + way]` for way ∈ 1:CACHE_WAYS -- `view_dims`: Cached dims corresponding to views -- `next_way`: Round-robin counter per slot for cache replacement +- `arr_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — indexed by N (dimensionality), + each entry is a per-slot cached `CuArray{T,N}` wrapper. Uses `setfield!(wrapper, :dims, dims)` + for zero-allocation reuse of unlimited dimension patterns within the same N. + When the backing vector's GPU buffer changes (rare: only on grow beyond capacity), + the wrapper's `:data` field is updated via DataRef refcount management. - State management fields (same as CPU) ## Design Note -Unlike CPU where view() returns SubArray and reshape() returns ReshapedArray, -CUDA returns CuArray for both operations. This allows a unified cache that -stores CuArray{T,N} for any N, eliminating the need for separate 1D/N-D caches. - -GPU view/reshape creation allocates ~80 bytes on CPU heap for the CuArray -wrapper object. N-way caching with for-loop lookup eliminates this allocation -when the same dimensions pattern is requested again. +Unlike CPU where `setfield!(:ref, MemoryRef)` is free (GC-managed), +CuArray's `:data` field is `DataRef` with manual refcounting. 
We minimize this cost +via `wrapper.data.rc !== vec.data.rc` identity check (~2ns): only update `:data` +when the backing vector's GPU buffer actually changed. The common case (same buffer) +is a simple `setfield!(:dims)` — truly zero-allocation. """ mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} # --- Storage --- vectors::Vector{CuVector{T}} - # --- Unified N-Way View Cache (flat layout) --- - # Length = n_slots * CACHE_WAYS - views::Vector{Any} # CuArray{T,N} for any N - view_dims::Vector{Any} # NTuple{N,Int} or nothing - - # --- Cache Replacement (round-robin per slot) --- - next_way::Vector{Int} # next_way[slot] ∈ 1:CACHE_WAYS + # --- N-D Wrapper Cache (setfield!-based reuse, matches CPU TypedPool) --- + arr_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot CuArray{T,N} # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -50,11 +44,9 @@ end function CuTypedPool{T}() where {T} return CuTypedPool{T}( - CuVector{T}[], # vectors - Any[], # views (N-way flat cache) - Any[], # view_dims - Int[], # next_way (round-robin counters) - 0, [0], [0] # State (1-based sentinel) + CuVector{T}[], # vectors + Union{Nothing, Vector{Any}}[], # arr_wrappers (indexed by N) + 0, [0], [0] # State (1-based sentinel) ) end diff --git a/src/legacy/state.jl b/src/legacy/state.jl index 724c53d2..7d6d1868 100644 --- a/src/legacy/state.jl +++ b/src/legacy/state.jl @@ -4,6 +4,17 @@ # Identical to v1.11+ state.jl except empty! clears legacy N-way cache fields # (nd_arrays, nd_dims, nd_ptrs, nd_next_way) instead of nd_wrappers. +# Zero-dims tuple for wrapper invalidation. Literal tuples for N ≤ 4 avoid +# ntuple(_ -> 0, N) dynamic-dispatch allocation (runtime N → heterogeneous +# return type → boxing). Falls back to ntuple for N > 4 (extremely rare). 
+@inline function _zero_dims_tuple(N::Int) + N == 1 && return (0,) + N == 2 && return (0, 0) + N == 3 && return (0, 0, 0) + N == 4 && return (0, 0, 0, 0) + return ntuple(_ -> 0, N) +end + # ============================================================================== # State Management - checkpoint! # ============================================================================== @@ -300,7 +311,7 @@ end dims === nothing && continue N = length(dims::Tuple) setfield!(ba::BitArray, :len, 0) - setfield!(ba::BitArray, :dims, ntuple(_ -> 0, N)) + setfield!(ba::BitArray, :dims, _zero_dims_tuple(N)) end end return nothing diff --git a/src/state.jl b/src/state.jl index 0d6331b6..9903c0b9 100644 --- a/src/state.jl +++ b/src/state.jl @@ -244,6 +244,17 @@ end _invalidate_released_slots!(::AbstractTypedPool, ::Int, ::Int) = nothing _invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing # legacy 2-arg compat +# Zero-dims tuple for wrapper invalidation. Literal tuples for N ≤ 4 avoid +# ntuple(_ -> 0, N) dynamic-dispatch allocation (runtime N → heterogeneous +# return type → boxing). Falls back to ntuple for N > 4 (extremely rare). 
+@inline function _zero_dims_tuple(N::Int) + N == 1 && return (0,) + N == 2 && return (0, 0) + N == 3 && return (0, 0, 0) + N == 4 && return (0, 0, 0, 0) + return ntuple(_ -> 0, N) +end + @noinline function _invalidate_released_slots!(tp::TypedPool{T}, old_n_active::Int, S::Int = POOL_SAFETY_LV[]) where {T} new_n = tp.n_active # Level 2+: poison vectors with NaN/sentinel before structural invalidation @@ -262,7 +273,7 @@ _invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing # legacy 2-ar for i in (new_n + 1):min(old_n_active, length(wrappers)) wrapper = @inbounds wrappers[i] wrapper === nothing && continue - setfield!(wrapper::Array, :size, ntuple(_ -> 0, N_idx)) + setfield!(wrapper::Array, :size, _zero_dims_tuple(N_idx)) end end return nothing @@ -288,7 +299,7 @@ end wrapper === nothing && continue ba = wrapper::BitArray setfield!(ba, :len, 0) - setfield!(ba, :dims, ntuple(_ -> 0, N_idx)) + setfield!(ba, :dims, _zero_dims_tuple(N_idx)) end end return nothing diff --git a/test/cuda/test_allocation.jl b/test/cuda/test_allocation.jl index 7a5ff1b1..974b3f90 100644 --- a/test/cuda/test_allocation.jl +++ b/test/cuda/test_allocation.jl @@ -169,53 +169,68 @@ end -@testset "_resize_without_shrink! GPU memory preservation" begin - _resize_without_shrink! = ext._resize_without_shrink! +@testset "_resize_to_fit! GPU memory preservation" begin + _resize_to_fit! = ext._resize_to_fit! 
@testset "Shrink preserves GPU pointer" begin v = CUDA.zeros(Float32, 1000) ptr = UInt(pointer(v)) - _resize_without_shrink!(v, 100) + _resize_to_fit!(v, 100) @test length(v) == 100 @test UInt(pointer(v)) == ptr end - @testset "Grow-back within maxsize: no realloc" begin + @testset "Grow-back within capacity: no realloc" begin v = CUDA.zeros(Float32, 1000) ptr = UInt(pointer(v)) # Shrink first - _resize_without_shrink!(v, 100) + _resize_to_fit!(v, 100) @test length(v) == 100 @test UInt(pointer(v)) == ptr # Grow back to original size — maxsize preserved, so no GPU realloc - _resize_without_shrink!(v, 1000) + _resize_to_fit!(v, 1000) @test length(v) == 1000 @test UInt(pointer(v)) == ptr end - @testset "Shrink to 0 preserves pointer" begin + @testset "Shrink to 0, grow back preserves pointer" begin v = CUDA.zeros(Float32, 500) ptr = UInt(pointer(v)) - _resize_without_shrink!(v, 0) + _resize_to_fit!(v, 0) @test length(v) == 0 # GPU memory still allocated (not freed) - # Grow back from 0 - _resize_without_shrink!(v, 500) + # Grow back from 0 — within capacity, so no GPU realloc + _resize_to_fit!(v, 500) @test length(v) == 500 @test UInt(pointer(v)) == ptr end + @testset "Grow within capacity after invalidation: no realloc" begin + # This is the key test: after safety invalidation (dims→0), + # re-acquire within original capacity should NOT trigger GPU realloc. + # (CUDA.jl v5.9.x resize! would always reallocate; _resize_to_fit! 
avoids this) + v = CUDA.zeros(Float32, 1000) + ptr = UInt(pointer(v)) + # Simulate safety invalidation + _resize_to_fit!(v, 0) + @test length(v) == 0 + # Re-acquire at smaller size (still within original capacity) + _resize_to_fit!(v, 200) + @test length(v) == 200 + @test UInt(pointer(v)) == ptr # Same GPU buffer + end + @testset "No-op when n == length" begin v = CUDA.zeros(Float32, 200) ptr = UInt(pointer(v)) - _resize_without_shrink!(v, 200) + _resize_to_fit!(v, 200) @test length(v) == 200 @test UInt(pointer(v)) == ptr end - @testset "Grow beyond maxsize delegates to resize!" begin + @testset "Grow beyond capacity delegates to resize!" begin v = CUDA.zeros(Float32, 100) - _resize_without_shrink!(v, 10_000) + _resize_to_fit!(v, 10_000) @test length(v) == 10_000 # Pointer may change (new allocation) — just verify length is correct end diff --git a/test/cuda/test_cuda_safety.jl b/test/cuda/test_cuda_safety.jl index a91e5d47..7df427f3 100644 --- a/test/cuda/test_cuda_safety.jl +++ b/test/cuda/test_cuda_safety.jl @@ -56,7 +56,7 @@ _cuda_test_leak(x) = x # ============================================================================== # Level 1: Poisoning + structural invalidation (length → 0) # ============================================================================== - # CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0) + # CUDA Level 1 now: poison fill → _resize_to_fit!(vec, 0) + arr_wrappers invalidation # Backing vector length becomes 0 (GPU memory preserved via maxsize). # Poison data persists in GPU memory and is visible on re-acquire (grow-back). 
@@ -141,17 +141,22 @@ _cuda_test_leak(x) = x rewind!(pool) end - @testset "Level 1: N-way cache invalidated on poisoned rewind" begin + @testset "Level 1: arr_wrappers invalidated on poisoned rewind" begin pool = _make_cuda_pool(1) checkpoint!(pool) v = acquire!(pool, Float32, 10) CUDA.fill!(v, 1.0f0) rewind!(pool) - # Cached views should be cleared (nothing) after poisoning - base = 0 * ext.CACHE_WAYS - for k in 1:ext.CACHE_WAYS - @test pool.float32.views[base + k] === nothing + # arr_wrappers for released slots should have zero-dims after invalidation + tp = pool.float32 + for N_idx in 1:length(tp.arr_wrappers) + wrappers_for_N = tp.arr_wrappers[N_idx] + wrappers_for_N === nothing && continue + for wrapper in wrappers_for_N + wrapper === nothing && continue + @test all(==(0), size(wrapper)) + end end end diff --git a/test/cuda/test_extension.jl b/test/cuda/test_extension.jl index 01753a3e..ba6c8df4 100644 --- a/test/cuda/test_extension.jl +++ b/test/cuda/test_extension.jl @@ -6,10 +6,8 @@ tp_fields = fieldnames(CuTypedPool) @test :vectors in tp_fields @test :n_active in tp_fields - # N-way cache fields - @test :views in tp_fields - @test :view_dims in tp_fields - @test :next_way in tp_fields # Round-robin counter + # arr_wrappers (setfield!-based wrapper reuse, replaces N-way cache) + @test :arr_wrappers in tp_fields # State management @test :_checkpoint_n_active in tp_fields @test :_checkpoint_depths in tp_fields diff --git a/test/cuda/test_nway_cache.jl b/test/cuda/test_nway_cache.jl index 526c5e5c..7c76f7a0 100644 --- a/test/cuda/test_nway_cache.jl +++ b/test/cuda/test_nway_cache.jl @@ -1,8 +1,10 @@ -# CUDA N-way Cache Tests -# Verifies N-way cache behavior for CuArray wrapper reuse -# Key: 4-way cache means 4 dimension patterns = zero-alloc, 5+ = allocation +# CUDA arr_wrappers Tests +# Verifies setfield!-based CuArray wrapper reuse for zero-allocation acquire. +# Key: arr_wrappers[N][slot] caches one CuArray{T,N} per (dimensionality N, slot). 
+# Same N = setfield!(:dims) → zero-alloc for unlimited dimension patterns. +# Different N = separate wrapper per N. -@testset "N-way Cache Types" begin +@testset "CuArray Wrapper Types" begin @testset "acquire! returns CuArray" begin @with_pool :cuda pool begin @@ -28,32 +30,24 @@ end end - @testset "CACHE_WAYS configuration" begin - # CACHE_WAYS is defined in CUDA extension (CPU ≥1.11 no longer uses view caching) - CUDAExt = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) - @test CUDAExt.CACHE_WAYS isa Int - @test 1 <= CUDAExt.CACHE_WAYS <= 16 - end - end -@testset "N-way Cache Behavior" begin +@testset "arr_wrappers: Unlimited Same-N Patterns" begin - # Key principles: - # 1. GPU allocation should ALWAYS be 0 (memory reused from pool) - # 2. CPU allocation: cache hit (4-way) = 0, cache miss (5-way) = >0 + # With arr_wrappers, same-N dimension patterns use setfield!(:dims). + # Unlike the old 4-way cache, there is NO eviction — unlimited patterns per N. # ========================================================================= # GPU Allocation Tests (with fill! to actually use the arrays) # ========================================================================= - @testset "GPU: 4-way zero-alloc" begin + @testset "GPU: 4 patterns zero-alloc" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) - function test_4way_gpu() + function test_4pat_gpu() for dims in dims_list @with_pool :cuda p begin A = acquire!(p, Float64, dims...) 
@@ -63,21 +57,21 @@ end end # Warmup - test_4way_gpu() - test_4way_gpu() + test_4pat_gpu() + test_4pat_gpu() GC.gc(); CUDA.reclaim() - gpu_alloc = CUDA.@allocated test_4way_gpu() + gpu_alloc = CUDA.@allocated test_4pat_gpu() @test gpu_alloc == 0 end - @testset "GPU: 5-way zero-alloc (even with cache miss)" begin + @testset "GPU: 5+ patterns zero-alloc (no eviction with arr_wrappers)" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50)) - function test_5way_gpu() + function test_5pat_gpu() for dims in dims_list @with_pool :cuda p begin A = acquire!(p, Float64, dims...) @@ -87,11 +81,11 @@ end end # Warmup - test_5way_gpu() - test_5way_gpu() + test_5pat_gpu() + test_5pat_gpu() GC.gc(); CUDA.reclaim() - gpu_alloc = CUDA.@allocated test_5way_gpu() + gpu_alloc = CUDA.@allocated test_5pat_gpu() @test gpu_alloc == 0 end @@ -99,13 +93,13 @@ end # CPU Allocation Tests (no fill! to avoid CUDA kernel overhead) # ========================================================================= - @testset "CPU: 4-way zero-alloc (cache hit)" begin + @testset "CPU: 4 patterns zero-alloc" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) - function test_4way_cpu() + function test_4pat_cpu() for dims in dims_list @with_pool :cuda p begin _ = acquire!(p, Float64, dims...) 
@@ -114,21 +108,22 @@ end end # Warmup - test_4way_cpu() - test_4way_cpu() + test_4pat_cpu() + test_4pat_cpu() GC.gc() - cpu_alloc = @allocated test_4way_cpu() - @test cpu_alloc == 0 # 4 patterns fit in 4-way cache + cpu_alloc = @allocated test_4pat_cpu() + @test cpu_alloc == 0 end - @testset "CPU: 5-way causes allocation (cache miss)" begin + @testset "CPU: 5+ patterns zero-alloc (arr_wrappers: same-N uses setfield!)" begin pool = get_task_local_cuda_pool() reset!(pool) + # All 2D — same N=2 → single wrapper per slot, setfield!(:dims) only dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50)) - function test_5way_cpu() + function test_5pat_cpu() for dims in dims_list @with_pool :cuda p begin _ = acquire!(p, Float64, dims...) @@ -137,25 +132,25 @@ end end # Warmup - test_5way_cpu() - test_5way_cpu() + test_5pat_cpu() + test_5pat_cpu() GC.gc() - cpu_alloc = @allocated test_5way_cpu() - @test cpu_alloc > 0 # 5 patterns exceed 4-way cache + cpu_alloc = @allocated test_5pat_cpu() + @test cpu_alloc == 0 # No eviction — arr_wrappers support unlimited same-N patterns end # ========================================================================= # unsafe_acquire! Tests # ========================================================================= - @testset "unsafe_acquire! GPU: 4-way zero-alloc" begin + @testset "unsafe_acquire! GPU: 4 patterns zero-alloc" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((8, 8), (4, 16), (16, 4), (2, 32)) - function test_unsafe_4way_gpu() + function test_unsafe_4pat_gpu() for dims in dims_list @with_pool :cuda p begin A = unsafe_acquire!(p, Float64, dims...) @@ -165,21 +160,21 @@ end end # Warmup - test_unsafe_4way_gpu() - test_unsafe_4way_gpu() + test_unsafe_4pat_gpu() + test_unsafe_4pat_gpu() GC.gc(); CUDA.reclaim() - gpu_alloc = CUDA.@allocated test_unsafe_4way_gpu() + gpu_alloc = CUDA.@allocated test_unsafe_4pat_gpu() @test gpu_alloc == 0 end - @testset "unsafe_acquire! 
CPU: 4-way zero-alloc" begin + @testset "unsafe_acquire! CPU: 4 patterns zero-alloc" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((8, 8), (4, 16), (16, 4), (2, 32)) - function test_unsafe_4way_cpu() + function test_unsafe_4pat_cpu() for dims in dims_list @with_pool :cuda p begin _ = unsafe_acquire!(p, Float64, dims...) @@ -188,21 +183,21 @@ end end # Warmup - test_unsafe_4way_cpu() - test_unsafe_4way_cpu() + test_unsafe_4pat_cpu() + test_unsafe_4pat_cpu() GC.gc() - cpu_alloc = @allocated test_unsafe_4way_cpu() + cpu_alloc = @allocated test_unsafe_4pat_cpu() @test cpu_alloc == 0 end - @testset "unsafe_acquire! CPU: 5-way causes allocation" begin + @testset "unsafe_acquire! CPU: 5+ patterns zero-alloc" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((8, 8), (4, 16), (16, 4), (2, 32), (32, 2)) - function test_unsafe_5way_cpu() + function test_unsafe_5pat_cpu() for dims in dims_list @with_pool :cuda p begin _ = unsafe_acquire!(p, Float64, dims...) @@ -211,17 +206,248 @@ end end # Warmup - test_unsafe_5way_cpu() - test_unsafe_5way_cpu() + test_unsafe_5pat_cpu() + test_unsafe_5pat_cpu() + GC.gc() + + cpu_alloc = @allocated test_unsafe_5pat_cpu() + @test cpu_alloc == 0 # arr_wrappers: unlimited same-N patterns + end + +end + +@testset "arr_wrappers: Mixed-N Patterns (1D + 2D + 3D)" begin + + # arr_wrappers[N][slot] caches a separate wrapper per dimensionality N. + # Same slot, different N → each N gets its own wrapper (first use = cache miss). + # After warmup of all (slot, N) combos → zero-alloc for any mix. 
+ + # ========================================================================= + # GPU Allocation Tests + # ========================================================================= + + @testset "GPU: 1D + 2D + 3D mixed acquire zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Each @with_pool scope resets slot counter → slot 1 gets 1D/2D/3D wrappers + function test_mixed_n_gpu() + @with_pool :cuda p begin + v = acquire!(p, Float64, 100) # 1D — arr_wrappers[1][1] + fill!(v, 1.0) + end + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) # 2D — arr_wrappers[2][1] + fill!(A, 2.0) + end + @with_pool :cuda p begin + T = acquire!(p, Float64, 5, 5, 4) # 3D — arr_wrappers[3][1] + fill!(T, 3.0) + end + end + + # Warmup (populates all 3 wrappers per N) + test_mixed_n_gpu() + test_mixed_n_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_mixed_n_gpu() + @test gpu_alloc == 0 + end + + @testset "GPU: mixed-N with varying dims zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Vary dims within each N across iterations + function test_mixed_n_varying_gpu() + for (d1, d2, d3) in ( + (100, (10, 10), (5, 5, 4)), + (200, (5, 20), (4, 5, 10)), + (50, (20, 5), (2, 5, 10)), + ) + @with_pool :cuda p begin + v = acquire!(p, Float64, d1) + fill!(v, 1.0) + end + @with_pool :cuda p begin + A = acquire!(p, Float64, d2...) + fill!(A, 2.0) + end + @with_pool :cuda p begin + T = acquire!(p, Float64, d3...) 
+ fill!(T, 3.0) + end + end + end + + # Warmup + test_mixed_n_varying_gpu() + test_mixed_n_varying_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_mixed_n_varying_gpu() + @test gpu_alloc == 0 + end + + @testset "GPU: multi-slot mixed-N zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Multiple acquires within one scope → different slots, different N + function test_multi_slot_mixed_n_gpu() + @with_pool :cuda p begin + v = acquire!(p, Float64, 100) # Slot 1, 1D + A = acquire!(p, Float64, 10, 10) # Slot 2, 2D + T = acquire!(p, Float64, 5, 5, 4) # Slot 3, 3D + fill!(v, 1.0) + fill!(A, 2.0) + fill!(T, 3.0) + end + end + + # Warmup + test_multi_slot_mixed_n_gpu() + test_multi_slot_mixed_n_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_multi_slot_mixed_n_gpu() + @test gpu_alloc == 0 + end + + # ========================================================================= + # CPU Allocation Tests (no fill! — wrapper creation overhead only) + # ========================================================================= + + @testset "CPU: 1D + 2D + 3D mixed acquire zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_mixed_n_cpu() + @with_pool :cuda p begin + _ = acquire!(p, Float64, 100) # 1D + end + @with_pool :cuda p begin + _ = acquire!(p, Float64, 10, 10) # 2D + end + @with_pool :cuda p begin + _ = acquire!(p, Float64, 5, 5, 4) # 3D + end + end + + # Warmup + test_mixed_n_cpu() + test_mixed_n_cpu() + GC.gc() + + cpu_alloc = @allocated test_mixed_n_cpu() + @test cpu_alloc == 0 + end + + @testset "CPU: mixed-N with varying dims zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_mixed_n_varying_cpu() + for (d1, d2, d3) in ( + (100, (10, 10), (5, 5, 4)), + (200, (5, 20), (4, 5, 10)), + (50, (20, 5), (2, 5, 10)), + ) + @with_pool :cuda p begin + _ = acquire!(p, Float64, d1) + end + @with_pool :cuda p begin + _ = acquire!(p, Float64, d2...) 
+ end + @with_pool :cuda p begin + _ = acquire!(p, Float64, d3...) + end + end + end + + # Warmup + test_mixed_n_varying_cpu() + test_mixed_n_varying_cpu() + GC.gc() + + cpu_alloc = @allocated test_mixed_n_varying_cpu() + @test cpu_alloc == 0 + end + + @testset "CPU: multi-slot mixed-N zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_multi_slot_mixed_n_cpu() + @with_pool :cuda p begin + _ = acquire!(p, Float64, 100) # Slot 1, 1D + _ = acquire!(p, Float64, 10, 10) # Slot 2, 2D + _ = acquire!(p, Float64, 5, 5, 4) # Slot 3, 3D + end + end + + # Warmup + test_multi_slot_mixed_n_cpu() + test_multi_slot_mixed_n_cpu() GC.gc() - cpu_alloc = @allocated test_unsafe_5way_cpu() - @test cpu_alloc > 0 + cpu_alloc = @allocated test_multi_slot_mixed_n_cpu() + @test cpu_alloc == 0 + end + + # ========================================================================= + # unsafe_acquire! Mixed-N Tests + # ========================================================================= + + @testset "unsafe_acquire! GPU: mixed-N zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_unsafe_mixed_n_gpu() + @with_pool :cuda p begin + v = unsafe_acquire!(p, Float64, 100) + A = unsafe_acquire!(p, Float64, 10, 10) + T = unsafe_acquire!(p, Float64, 5, 5, 4) + fill!(v, 1.0) + fill!(A, 2.0) + fill!(T, 3.0) + end + end + + # Warmup + test_unsafe_mixed_n_gpu() + test_unsafe_mixed_n_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_unsafe_mixed_n_gpu() + @test gpu_alloc == 0 + end + + @testset "unsafe_acquire! 
CPU: mixed-N zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_unsafe_mixed_n_cpu() + @with_pool :cuda p begin + _ = unsafe_acquire!(p, Float64, 100) + _ = unsafe_acquire!(p, Float64, 10, 10) + _ = unsafe_acquire!(p, Float64, 5, 5, 4) + end + end + + # Warmup + test_unsafe_mixed_n_cpu() + test_unsafe_mixed_n_cpu() + GC.gc() + + cpu_alloc = @allocated test_unsafe_mixed_n_cpu() + @test cpu_alloc == 0 end end -@testset "N-way Cache: Loop Patterns" begin +@testset "arr_wrappers: Loop Patterns" begin @testset "100 iterations: GPU always zero-alloc" begin pool = get_task_local_cuda_pool() @@ -229,7 +455,7 @@ end dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) - function test_loop_4way() + function test_loop_4pat() for _ in 1:100 for dims in dims_list @with_pool :cuda p begin @@ -241,20 +467,20 @@ end end # Warmup - test_loop_4way() + test_loop_4pat() GC.gc(); CUDA.reclaim() - gpu_alloc = CUDA.@allocated test_loop_4way() + gpu_alloc = CUDA.@allocated test_loop_4pat() @test gpu_alloc == 0 # GPU memory always reused end - @testset "100 iterations with 5 patterns: GPU still zero-alloc" begin + @testset "100 iterations with 5+ patterns: GPU still zero-alloc" begin pool = get_task_local_cuda_pool() reset!(pool) dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50)) - function test_loop_5way() + function test_loop_5pat() for _ in 1:100 for dims in dims_list @with_pool :cuda p begin @@ -266,16 +492,16 @@ end end # Warmup - test_loop_5way() + test_loop_5pat() GC.gc(); CUDA.reclaim() - gpu_alloc = CUDA.@allocated test_loop_5way() - @test gpu_alloc == 0 # GPU memory reused even with cache thrashing + gpu_alloc = CUDA.@allocated test_loop_5pat() + @test gpu_alloc == 0 # GPU memory reused — no cache eviction end end -@testset "N-way Cache: Multiple Slots" begin +@testset "arr_wrappers: Multiple Slots" begin @testset "Multiple arrays per iteration: GPU zero-alloc" begin pool = get_task_local_cuda_pool() @@ -306,7 +532,6 @@ end reset!(pool) # 
Use same dims for both slots, just vary across iterations - # This tests GPU memory reuse, not cache behavior dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) function test_multi_slot_varying() @@ -331,7 +556,126 @@ end end -@testset "N-way Cache: Resize Behavior" begin +@testset "arr_wrappers: reshape! Zero-Alloc" begin + + # _reshape_impl! for CuArray uses arr_wrappers cache for cross-dim reshape, + # and in-place setfield!(:dims) for same-dim reshape (no pool interaction). + + @testset "GPU: cross-dim reshape zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_reshape_cross_dim_gpu() + @with_pool :cuda p begin + A = acquire!(p, Float64, 12) + CUDA.fill!(A, 1.0) + # 1D → 2D (cross-dim: claims slot, uses arr_wrappers[2]) + B = reshape!(p, A, 3, 4) + CUDA.fill!(B, 2.0) + end + end + + # Warmup + test_reshape_cross_dim_gpu() + test_reshape_cross_dim_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_reshape_cross_dim_gpu() + @test gpu_alloc == 0 + end + + @testset "CPU: cross-dim reshape zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_reshape_cross_dim_cpu() + @with_pool :cuda p begin + A = acquire!(p, Float64, 12) + B = reshape!(p, A, 3, 4) + end + end + + test_reshape_cross_dim_cpu() + test_reshape_cross_dim_cpu() + GC.gc() + + cpu_alloc = @allocated test_reshape_cross_dim_cpu() + @test cpu_alloc == 0 + end + + @testset "GPU: same-dim reshape zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_reshape_same_dim_gpu() + @with_pool :cuda p begin + A = acquire!(p, Float64, 3, 4) + CUDA.fill!(A, 1.0) + # 2D → 2D (same-dim: in-place setfield!, no pool interaction) + B = reshape!(p, A, 4, 3) + CUDA.fill!(B, 2.0) + end + end + + test_reshape_same_dim_gpu() + test_reshape_same_dim_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_reshape_same_dim_gpu() + @test gpu_alloc == 0 + end + + @testset "GPU: mixed reshape sequence 
zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_reshape_mixed_gpu() + @with_pool :cuda p begin + A = acquire!(p, Float64, 24) + CUDA.fill!(A, 1.0) + B = reshape!(p, A, 4, 6) # 1D → 2D + C = reshape!(p, A, 2, 3, 4) # 1D → 3D + CUDA.fill!(B, 2.0) + CUDA.fill!(C, 3.0) + end + end + + test_reshape_mixed_gpu() + test_reshape_mixed_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_reshape_mixed_gpu() + @test gpu_alloc == 0 + end + + @testset "Correctness: data sharing through reshape" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @with_pool :cuda p begin + A = acquire!(p, Float64, 12) + CUDA.fill!(A, 1.0) + B = reshape!(p, A, 3, 4) + @test size(B) == (3, 4) + @test B isa CuArray{Float64, 2} + # Data identity: B shares GPU memory with A + @test length(B) == length(A) + end + end + + @testset "DimensionMismatch" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @with_pool :cuda p begin + A = acquire!(p, Float64, 12) + @test_throws DimensionMismatch reshape!(p, A, 5, 5) + end + end + +end + +@testset "arr_wrappers: Resize Behavior" begin @testset "Resize: GPU zero-alloc maintained" begin pool = get_task_local_cuda_pool()