Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 1 addition & 14 deletions ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,6 @@ module AdaptiveArrayPoolsCUDAExt

using AdaptiveArrayPools
using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool
using Preferences: @load_preference, @set_preferences!

# N-way view cache configuration (CUDA only — CPU ≥1.11 uses slot-first _claim_slot!).
# GPU view/reshape allocates ~80 bytes on CPU heap, so caching still matters.
const CACHE_WAYS = let
ways = @load_preference("cache_ways", 4)::Int
if ways < 1 || ways > 16
@warn "CACHE_WAYS=$ways out of range [1,16], using default 4"
4
else
ways
end
end
using CUDA

# Type definitions
Expand All @@ -31,7 +18,7 @@ include("types.jl")
# Dispatch methods (allocate_vector, wrap_array, get_typed_pool!)
include("dispatch.jl")

# GPU-specific get_view! implementation
# GPU-specific acquire (arr_wrappers + setfield!, _resize_to_fit!, _reshape_impl!)
include("acquire.jl")

# Task-local pool (multi-device aware)
Expand Down
389 changes: 256 additions & 133 deletions ext/AdaptiveArrayPoolsCUDAExt/acquire.jl

Large diffs are not rendered by default.

34 changes: 19 additions & 15 deletions ext/AdaptiveArrayPoolsCUDAExt/debug.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,19 @@
# Safety levels on CUDA differ from CPU:
# - Level 0: Zero overhead (all branches dead-code-eliminated)
# - Level 1: Poisoning (NaN/sentinel fill) + structural invalidation via
# _resize_without_shrink!(vec, 0) + N-way cache invalidation
# _resize_to_fit!(vec, 0) + arr_wrappers invalidation (setfield!(:dims, zeros))
# - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays)
# - Level 3: Full + borrow call-site registry + debug messages
#
# Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays.
# On CUDA, resize!(CuVector, 0) would free GPU memory, so we use
# _resize_without_shrink!(vec, 0) instead — sets dims to (0,) while preserving
# _resize_to_fit!(vec, 0) instead — sets dims to (0,) while preserving
# the GPU allocation (maxsize). Poisoning fills sentinel data before the shrink.
# arr_wrappers are invalidated by setting wrapper dims to zeros (matches CPU pattern).

using AdaptiveArrayPools: _safety_level, _validate_pool_return,
_set_pending_callsite!, _maybe_record_borrow!,
_invalidate_released_slots!,
_invalidate_released_slots!, _zero_dims_tuple,
_throw_pool_escape_error,
POOL_DEBUG, POOL_SAFETY_LV,
PoolRuntimeEscapeError
Expand Down Expand Up @@ -49,26 +50,29 @@ end
#
# Overrides the no-op fallback in base. On CUDA:
# - Level 0: no-op (base _rewind_typed_pool! gates with S >= 1, so never called)
# - Level 1+: poison released CuVectors + invalidate N-way view cache
# - NO resize!(cuv, 0) — would free GPU memory
# - Level 1+: poison released CuVectors + invalidate arr_wrappers
# - NO resize!(cuv, 0) — would free GPU memory; use _resize_to_fit! instead

@noinline function AdaptiveArrayPools._invalidate_released_slots!(
tp::CuTypedPool{T}, old_n_active::Int, S::Int
) where {T}
new_n = tp.n_active
# Poison released CuVectors + shrink logical length to 0
for i in (new_n + 1):old_n_active
# Poison released CuVectors with sentinel values
_cuda_poison_fill!(@inbounds tp.vectors[i])
# Shrink logical length to 0 (GPU memory preserved via _resize_without_shrink!).
# Shrink logical length to 0 (GPU memory preserved via _resize_to_fit!).
# Matches CPU behavior where resize!(vec, 0) invalidates SubArray references.
_resize_without_shrink!(@inbounds(tp.vectors[i]), 0)
# Invalidate N-way cache entries for released slots.
# After poisoning, cached views point at poisoned data — clear them so
# re-acquire creates fresh views instead of returning stale poisoned ones.
base = (i - 1) * CACHE_WAYS
for k in 1:CACHE_WAYS
@inbounds tp.views[base + k] = nothing
@inbounds tp.view_dims[base + k] = nothing
_resize_to_fit!(@inbounds(tp.vectors[i]), 0)
end
# Invalidate arr_wrappers for released slots (matches CPU pattern from src/state.jl)
for N_idx in 1:length(tp.arr_wrappers)
wrappers_for_N = @inbounds tp.arr_wrappers[N_idx]
wrappers_for_N === nothing && continue
wrappers = wrappers_for_N::Vector{Any}
for i in (new_n + 1):min(old_n_active, length(wrappers))
wrapper = @inbounds wrappers[i]
wrapper === nothing && continue
setfield!(wrapper::CuArray, :dims, _zero_dims_tuple(N_idx))
end
end
return nothing
Expand Down
4 changes: 1 addition & 3 deletions ext/AdaptiveArrayPoolsCUDAExt/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,7 @@ CUDA.reclaim()
"""
function Base.empty!(tp::CuTypedPool)
empty!(tp.vectors)
empty!(tp.views)
empty!(tp.view_dims)
empty!(tp.next_way)
empty!(tp.arr_wrappers)
tp.n_active = 0
# Restore sentinel values
empty!(tp._checkpoint_n_active)
Expand Down
48 changes: 20 additions & 28 deletions ext/AdaptiveArrayPoolsCUDAExt/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,38 @@
# ==============================================================================

# Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()),
# NOT SubArray. However, we still cache view objects to avoid CPU heap allocation
# (~80 bytes per call) for the CuVector metadata wrapper.

# Note: Uses shared CACHE_WAYS constant from main module for consistency.
# NOT SubArray. GPU view/reshape creation allocates ~80 bytes on CPU heap for the
# CuArray wrapper. We cache wrappers via arr_wrappers to achieve zero-allocation
# on cache hit (same approach as CPU's setfield!-based Array wrapper reuse).

"""
CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}}

GPU memory pool for element type `T`. Uses unified N-way view caching for all dimensions.
GPU memory pool for element type `T`. Uses `arr_wrappers`-based CuArray reuse
for zero-allocation acquire (same design as CPU TypedPool on Julia 1.11+).

## Fields
- `vectors`: Backing `CuVector{T}` storage (one per slot)
- `views`: Flat N-way cache storing CuArray of any dimension
- Layout: `views[(slot-1)*CACHE_WAYS + way]` for way ∈ 1:CACHE_WAYS
- `view_dims`: Cached dims corresponding to views
- `next_way`: Round-robin counter per slot for cache replacement
- `arr_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — indexed by N (dimensionality),
each entry is a per-slot cached `CuArray{T,N}` wrapper. Uses `setfield!(wrapper, :dims, dims)`
for zero-allocation reuse of unlimited dimension patterns within the same N.
When the backing vector's GPU buffer changes (rare: only on grow beyond capacity),
the wrapper's `:data` field is updated via DataRef refcount management.
- State management fields (same as CPU)

## Design Note
Unlike CPU where view() returns SubArray and reshape() returns ReshapedArray,
CUDA returns CuArray for both operations. This allows a unified cache that
stores CuArray{T,N} for any N, eliminating the need for separate 1D/N-D caches.

GPU view/reshape creation allocates ~80 bytes on CPU heap for the CuArray
wrapper object. N-way caching with for-loop lookup eliminates this allocation
when the same dimensions pattern is requested again.
Unlike CPU where `setfield!(:ref, MemoryRef)` is free (GC-managed),
CuArray's `:data` field is `DataRef` with manual refcounting. We minimize this cost
via `wrapper.data.rc !== vec.data.rc` identity check (~2ns): only update `:data`
when the backing vector's GPU buffer actually changed. The common case (same buffer)
is a simple `setfield!(:dims)` — truly zero-allocation.
"""
mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}}
# --- Storage ---
vectors::Vector{CuVector{T}}

# --- Unified N-Way View Cache (flat layout) ---
# Length = n_slots * CACHE_WAYS
views::Vector{Any} # CuArray{T,N} for any N
view_dims::Vector{Any} # NTuple{N,Int} or nothing

# --- Cache Replacement (round-robin per slot) ---
next_way::Vector{Int} # next_way[slot] ∈ 1:CACHE_WAYS
# --- N-D Wrapper Cache (setfield!-based reuse, matches CPU TypedPool) ---
arr_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot CuArray{T,N}

# --- State Management (1-based sentinel pattern) ---
n_active::Int
Expand All @@ -50,11 +44,9 @@ end

function CuTypedPool{T}() where {T}
return CuTypedPool{T}(
CuVector{T}[], # vectors
Any[], # views (N-way flat cache)
Any[], # view_dims
Int[], # next_way (round-robin counters)
0, [0], [0] # State (1-based sentinel)
CuVector{T}[], # vectors
Union{Nothing, Vector{Any}}[], # arr_wrappers (indexed by N)
0, [0], [0] # State (1-based sentinel)
)
end

Expand Down
13 changes: 12 additions & 1 deletion src/legacy/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@
# Identical to v1.11+ state.jl except empty! clears legacy N-way cache fields
# (nd_arrays, nd_dims, nd_ptrs, nd_next_way) instead of nd_wrappers.

# Build an all-zero dims tuple of length `N`, used when invalidating cached
# array wrappers. The common small-N cases (N ≤ 4) are spelled out as literal
# tuples: with a runtime `N`, `ntuple(_ -> 0, N)` cannot infer a concrete
# tuple length and would box its result. Larger N is extremely rare, so the
# generic `ntuple` fallback is acceptable there.
@inline function _zero_dims_tuple(N::Int)
    if N == 1
        return (0,)
    elseif N == 2
        return (0, 0)
    elseif N == 3
        return (0, 0, 0)
    elseif N == 4
        return (0, 0, 0, 0)
    else
        return ntuple(_ -> 0, N)
    end
end

# ==============================================================================
# State Management - checkpoint!
# ==============================================================================
Expand Down Expand Up @@ -300,7 +311,7 @@ end
dims === nothing && continue
N = length(dims::Tuple)
setfield!(ba::BitArray, :len, 0)
setfield!(ba::BitArray, :dims, ntuple(_ -> 0, N))
setfield!(ba::BitArray, :dims, _zero_dims_tuple(N))
end
end
return nothing
Expand Down
15 changes: 13 additions & 2 deletions src/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,17 @@ end
_invalidate_released_slots!(::AbstractTypedPool, ::Int, ::Int) = nothing
_invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing # legacy 2-arg compat

# Build an all-zero dims tuple of length `N`, used when invalidating cached
# array wrappers. The common small-N cases (N ≤ 4) are spelled out as literal
# tuples: with a runtime `N`, `ntuple(_ -> 0, N)` cannot infer a concrete
# tuple length and would box its result. Larger N is extremely rare, so the
# generic `ntuple` fallback is acceptable there.
@inline function _zero_dims_tuple(N::Int)
    if N == 1
        return (0,)
    elseif N == 2
        return (0, 0)
    elseif N == 3
        return (0, 0, 0)
    elseif N == 4
        return (0, 0, 0, 0)
    else
        return ntuple(_ -> 0, N)
    end
end

@noinline function _invalidate_released_slots!(tp::TypedPool{T}, old_n_active::Int, S::Int = POOL_SAFETY_LV[]) where {T}
new_n = tp.n_active
# Level 2+: poison vectors with NaN/sentinel before structural invalidation
Expand All @@ -262,7 +273,7 @@ _invalidate_released_slots!(::AbstractTypedPool, ::Int) = nothing # legacy 2-ar
for i in (new_n + 1):min(old_n_active, length(wrappers))
wrapper = @inbounds wrappers[i]
wrapper === nothing && continue
setfield!(wrapper::Array, :size, ntuple(_ -> 0, N_idx))
setfield!(wrapper::Array, :size, _zero_dims_tuple(N_idx))
end
end
return nothing
Expand All @@ -288,7 +299,7 @@ end
wrapper === nothing && continue
ba = wrapper::BitArray
setfield!(ba, :len, 0)
setfield!(ba, :dims, ntuple(_ -> 0, N_idx))
setfield!(ba, :dims, _zero_dims_tuple(N_idx))
end
end
return nothing
Expand Down
41 changes: 28 additions & 13 deletions test/cuda/test_allocation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,53 +169,68 @@

end

@testset "_resize_without_shrink! GPU memory preservation" begin
_resize_without_shrink! = ext._resize_without_shrink!
@testset "_resize_to_fit! GPU memory preservation" begin
_resize_to_fit! = ext._resize_to_fit!

@testset "Shrink preserves GPU pointer" begin
v = CUDA.zeros(Float32, 1000)
ptr = UInt(pointer(v))
_resize_without_shrink!(v, 100)
_resize_to_fit!(v, 100)
@test length(v) == 100
@test UInt(pointer(v)) == ptr
end

@testset "Grow-back within maxsize: no realloc" begin
@testset "Grow-back within capacity: no realloc" begin
v = CUDA.zeros(Float32, 1000)
ptr = UInt(pointer(v))
# Shrink first
_resize_without_shrink!(v, 100)
_resize_to_fit!(v, 100)
@test length(v) == 100
@test UInt(pointer(v)) == ptr
# Grow back to original size — maxsize preserved, so no GPU realloc
_resize_without_shrink!(v, 1000)
_resize_to_fit!(v, 1000)
@test length(v) == 1000
@test UInt(pointer(v)) == ptr
end

@testset "Shrink to 0 preserves pointer" begin
@testset "Shrink to 0, grow back preserves pointer" begin
v = CUDA.zeros(Float32, 500)
ptr = UInt(pointer(v))
_resize_without_shrink!(v, 0)
_resize_to_fit!(v, 0)
@test length(v) == 0
# GPU memory still allocated (not freed)
# Grow back from 0
_resize_without_shrink!(v, 500)
# Grow back from 0 — within capacity, so no GPU realloc
_resize_to_fit!(v, 500)
@test length(v) == 500
@test UInt(pointer(v)) == ptr
end

@testset "Grow within capacity after invalidation: no realloc" begin
# This is the key test: after safety invalidation (dims→0),
# re-acquire within original capacity should NOT trigger GPU realloc.
# (CUDA.jl v5.9.x resize! would always reallocate; _resize_to_fit! avoids this)
v = CUDA.zeros(Float32, 1000)
ptr = UInt(pointer(v))
# Simulate safety invalidation
_resize_to_fit!(v, 0)
@test length(v) == 0
# Re-acquire at smaller size (still within original capacity)
_resize_to_fit!(v, 200)
@test length(v) == 200
@test UInt(pointer(v)) == ptr # Same GPU buffer
end

@testset "No-op when n == length" begin
v = CUDA.zeros(Float32, 200)
ptr = UInt(pointer(v))
_resize_without_shrink!(v, 200)
_resize_to_fit!(v, 200)
@test length(v) == 200
@test UInt(pointer(v)) == ptr
end

@testset "Grow beyond maxsize delegates to resize!" begin
@testset "Grow beyond capacity delegates to resize!" begin
v = CUDA.zeros(Float32, 100)
_resize_without_shrink!(v, 10_000)
_resize_to_fit!(v, 10_000)
@test length(v) == 10_000
# Pointer may change (new allocation) — just verify length is correct
end
Expand Down
17 changes: 11 additions & 6 deletions test/cuda/test_cuda_safety.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ _cuda_test_leak(x) = x
# ==============================================================================
# Level 1: Poisoning + structural invalidation (length → 0)
# ==============================================================================
# CUDA Level 1 now: poison fill → _resize_without_shrink!(vec, 0)
# CUDA Level 1 now: poison fill → _resize_to_fit!(vec, 0) + arr_wrappers invalidation
# Backing vector length becomes 0 (GPU memory preserved via maxsize).
# Poison data persists in GPU memory and is visible on re-acquire (grow-back).

Expand Down Expand Up @@ -141,17 +141,22 @@ _cuda_test_leak(x) = x
rewind!(pool)
end

@testset "Level 1: N-way cache invalidated on poisoned rewind" begin
@testset "Level 1: arr_wrappers invalidated on poisoned rewind" begin
pool = _make_cuda_pool(1)
checkpoint!(pool)
v = acquire!(pool, Float32, 10)
CUDA.fill!(v, 1.0f0)
rewind!(pool)

# Cached views should be cleared (nothing) after poisoning
base = 0 * ext.CACHE_WAYS
for k in 1:ext.CACHE_WAYS
@test pool.float32.views[base + k] === nothing
# arr_wrappers for released slots should have zero-dims after invalidation
tp = pool.float32
for N_idx in 1:length(tp.arr_wrappers)
wrappers_for_N = tp.arr_wrappers[N_idx]
wrappers_for_N === nothing && continue
for wrapper in wrappers_for_N
wrapper === nothing && continue
@test all(==(0), size(wrapper))
end
end
end

Expand Down
6 changes: 2 additions & 4 deletions test/cuda/test_extension.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
tp_fields = fieldnames(CuTypedPool)
@test :vectors in tp_fields
@test :n_active in tp_fields
# N-way cache fields
@test :views in tp_fields
@test :view_dims in tp_fields
@test :next_way in tp_fields # Round-robin counter
# arr_wrappers (setfield!-based wrapper reuse, replaces N-way cache)
@test :arr_wrappers in tp_fields
# State management
@test :_checkpoint_n_active in tp_fields
@test :_checkpoint_depths in tp_fields
Expand Down
Loading
Loading