diff --git a/docs/make.jl b/docs/make.jl index feb85de7..279ca48d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -38,6 +38,34 @@ const README_PATH_MAPPINGS = [ (r"\(docs/safety\.md(#[^)]+)?\)", s"(basics/safety-rules.md\1)"), ] +""" +Inject Google Search Console verification meta tag into generated HTML files. +This is enabled only when `ENV["GOOGLE_SITE_VERIFICATION"]` is set. +""" +function inject_google_site_verification!(build_dir::String) + token = strip(get(ENV, "GOOGLE_SITE_VERIFICATION", "")) + isempty(token) && return + + safe_token = replace(token, '"' => "&quot;") + meta_tag = "<meta name=\"google-site-verification\" content=\"$(safe_token)\" />" + injected = 0 + + for (root, _, files) in walkdir(build_dir) + for file in files + endswith(file, ".html") || continue + path = joinpath(root, file) + html = read(path, String) + occursin("google-site-verification", html) && continue + occursin("</head>", html) || continue + + write_if_changed(path, replace(html, "</head>" => "$(meta_tag)\n</head>"; count = 1)) + injected += 1 + end + end + + return @info "Injected google-site-verification meta tag" files = injected build_dir = build_dir +end + """ Rewrite relative paths in README.md for Documenter structure. @@ -74,9 +102,15 @@ makedocs( sitename = "AdaptiveArrayPools.jl", authors = "Min-Gu Yoo", modules = [AdaptiveArrayPools], + # servedocs() sets root to docs/ which conflicts with project-root remotes. + # Enable GitHub source links only in CI where makedocs root matches git root. + remotes = get(ENV, "CI", nothing) == "true" ? 
+ Dict(dirname(@__DIR__) => (Documenter.Remotes.GitHub("ProjectTorreyPines", "AdaptiveArrayPools.jl"), "master")) : + nothing, format = Documenter.HTML( prettyurls = get(ENV, "CI", nothing) == "true", canonical = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl", + edit_link = :commit, assets = String[], ), pages = [ @@ -92,6 +126,7 @@ makedocs( "Multi-threading" => "features/multi-threading.md", ], "Features" => [ + "Pool Safety" => "features/safety.md", "`@maybe_with_pool`" => "features/maybe-with-pool.md", "Bit Arrays" => "features/bit-arrays.md", "CUDA Support" => "features/cuda-support.md", @@ -112,6 +147,8 @@ makedocs( warnonly = [:cross_references, :missing_docs], ) +inject_google_site_verification!(joinpath(@__DIR__, "build")) + deploydocs( repo = "github.com/ProjectTorreyPines/AdaptiveArrayPools.jl.git", devbranch = "master", diff --git a/docs/src/features/safety.md b/docs/src/features/safety.md new file mode 100644 index 00000000..72b387b3 --- /dev/null +++ b/docs/src/features/safety.md @@ -0,0 +1,131 @@ +# Pool Safety + +AdaptiveArrayPools catches pool-escape bugs at **two levels**: compile-time (macro analysis) and runtime (configurable safety levels). + +## Compile-Time Detection + +The `@with_pool` macro statically analyzes your code and **rejects** any expression that would return a pool-backed array. This catches the most common mistakes at zero runtime cost. + +```julia +# Direct array escape — caught at macro expansion time +@with_pool pool begin + v = acquire!(pool, Float64, 100) + v # ← ERROR: v escapes the pool scope +end +``` + +This would throw an error message as follows: +``` +ERROR: LoadError: PoolEscapeError (compile-time) + + The following variable escapes the @with_pool scope: + + v ← pool-acquired view + + Declarations: + [1] v = acquire!(pool, Float64, 100) [myfile.jl:2] + + Escaping return: + [1] v [myfile.jl:3] + + Fix: Use collect(v) to return owned copies. 
+ Or use a regular Julia array (zeros()/Array{T}()) if it must outlive the pool scope. + +in expression starting at myfile.jl:1 +``` + +The analyzer tracks aliases, containers, and convenience wrappers: + +```julia +# All of these are caught at compile time: +@with_pool pool begin + v = zeros!(pool, Float64, 10) + w = v # alias of pool variable + t = (1, v) # tuple wrapping pool array + w # ← ERROR +end + +@with_pool pool function bad() + A = acquire!(pool, Float64, 3, 3) + return A # ← ERROR (explicit return) +end +``` + +Safe patterns pass without error: + +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + sum(v) # ✅ scalar result +end + +@with_pool pool begin + v = acquire!(pool, Float64, 100) + collect(v) # ✅ owned copy +end +``` + +## Runtime Safety Levels + +For bugs the compiler can't catch (e.g., values hidden behind opaque function calls), runtime safety provides configurable protection via the type parameter `S` in `AdaptiveArrayPool{S}`. + +### Level Overview + +| Level | Name | CPU | CUDA | Overhead | +|-------|------|-----|------|----------| +| **0** | off | No-op (all branches dead-code-eliminated) | Same | Zero | +| **1** | guard | `resize!(v,0)` + `setfield!` invalidation | NaN/sentinel poisoning + cache clear | ~5ns/slot | +| **2** | full | Level 1 + data poisoning + escape detection at scope exit | Level 1 + device-pointer overlap check | Moderate | +| **3** | debug | Level 2 + acquire call-site tracking | Same | Moderate+ | + +### Why CPU and CUDA Differ at Level 1 + +Both achieve the same goal — **make stale references fail loudly** — but use different mechanisms: + +| | CPU | CUDA | +|---|-----|------| +| **Strategy** | Structural invalidation | Data poisoning | +| **Mechanism** | `resize!(v, 0)` shrinks backing vector to length 0; `setfield!(:size, (0,))` zeroes the array dimensions | `CUDA.fill!(v, NaN)` / `typemax` / `true` fills backing CuVector with sentinel values | +| **Stale access result** | `BoundsError` (array has 
length 0) | Reads `NaN` or `typemax` (obviously wrong data) | +| **Why not the other way?** | CPU `resize!` is cheap (~0 cost) | CUDA `resize!` calls `CUDA.Mem.free()` — destroys the pooled VRAM allocation | +| **Cache invalidation** | View length/dims zeroed | N-way view cache entries cleared to `nothing` | + +### Setting the Level + +```julia +using AdaptiveArrayPools + +# Enable full safety on CPU + all GPU devices (preserves cached arrays, zero-copy) +set_safety_level!(2) + +# Back to zero overhead everywhere +set_safety_level!(0) +``` + +The pool type parameter `S` is a compile-time constant. At `S=0`, the JIT eliminates all safety branches via dead-code elimination — true zero overhead with no `Ref` reads or conditional branches. + +### Data Poisoning (Level 2+, CPU) + +At Level 1, CPU relies on **structural invalidation** (`resize!` + `setfield!`) which makes stale views throw `BoundsError`. At Level 2+, CPU additionally **poisons** the backing vector data with sentinel values (`NaN`, `typemax`, all-`true` for `BitVector`) *before* structural invalidation. This catches stale access through `unsafe_acquire!` wrappers on Julia 1.10 where `setfield!` on Array is unavailable. + +CUDA already poisons at Level 1 (its primary invalidation strategy), so no additional poisoning step is needed at Level 2. + +### Escape Detection (Level 2+) + +At every `@with_pool` scope exit, the return value is inspected for overlap with pool-backed memory. Recursively checks `Tuple`, `NamedTuple`, `Dict`, `Pair`, `Set`, and `AbstractArray` elements. + +Level 3 additionally records each `acquire!` call-site, so the error message pinpoints the exact source line and expression that allocated the escaping array. + +### Legacy: `POOL_DEBUG` + +`POOL_DEBUG[] = true` triggers Level 2 escape detection regardless of `S`. For new code, prefer `set_safety_level!(2)`. 
+ +## Recommended Workflow + +```julia +# Development / Testing: catch bugs early +set_safety_level!(2) # or 3 for call-site info in error messages + +# Production: zero overhead +set_safety_level!(0) # all safety branches eliminated by the compiler +``` diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index bef3f3a3..8abb3672 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -40,6 +40,9 @@ include("task_local_pool.jl") # State management (checkpoint!, rewind!, reset!, empty!) include("state.jl") +# Safety: poisoning, escape detection, borrow tracking +include("debug.jl") + # Display & statistics (pool_stats, show) include("utils.jl") diff --git a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl new file mode 100644 index 00000000..2ac371f8 --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl @@ -0,0 +1,256 @@ +# ============================================================================== +# CUDA Safety: Poisoning, Escape Detection, Borrow Tracking +# ============================================================================== +# CUDA-specific safety implementations for CuAdaptiveArrayPool{S}. +# +# Safety levels on CUDA differ from CPU: +# - Level 0: Zero overhead (all branches dead-code-eliminated) +# - Level 1: Poisoning (NaN/sentinel fill) + N-way cache invalidation +# (CUDA equivalent of CPU's resize!/setfield! structural invalidation) +# - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays) +# - Level 3: Full + borrow call-site registry + debug messages +# +# Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays. +# On CUDA, resize!(CuVector, 0) frees GPU memory, so we use poisoning instead. 
+ +using AdaptiveArrayPools: _safety_level, _validate_pool_return, + _set_pending_callsite!, _maybe_record_borrow!, + _invalidate_released_slots!, + _throw_pool_escape_error, + POOL_DEBUG, POOL_SAFETY_LV, + PoolRuntimeEscapeError + +# ============================================================================== +# Poisoning: Fill released CuVectors with sentinel values (Level 1+) +# ============================================================================== + +_cuda_poison_value(::Type{T}) where {T <: AbstractFloat} = T(NaN) +_cuda_poison_value(::Type{T}) where {T <: Integer} = typemax(T) +_cuda_poison_value(::Type{Complex{T}}) where {T} = Complex{T}(_cuda_poison_value(T), _cuda_poison_value(T)) +_cuda_poison_value(::Type{Bool}) = true +_cuda_poison_value(::Type{T}) where {T} = zero(T) # generic fallback + +""" + _cuda_poison_fill!(v::CuVector{T}) + +Fill a CuVector with a detectable sentinel value (NaN for floats, typemax for ints). +@noinline to avoid inlining GPU kernel launch overhead into hot rewind paths. +""" +@noinline function _cuda_poison_fill!(v::CuVector{T}) where {T} + length(v) > 0 && CUDA.fill!(v, _cuda_poison_value(T)) + return nothing +end + +# ============================================================================== +# _invalidate_released_slots! for CuTypedPool (Level 1+) +# ============================================================================== +# +# Overrides the no-op fallback in base. On CUDA: +# - Level 0: no-op (base _rewind_typed_pool! gates with S >= 1, so never called) +# - Level 1+: poison released CuVectors + invalidate N-way view cache +# - NO resize!(cuv, 0) — would free GPU memory + +@noinline function AdaptiveArrayPools._invalidate_released_slots!( + tp::CuTypedPool{T}, old_n_active::Int, S::Int + ) where {T} + new_n = tp.n_active + for i in (new_n + 1):old_n_active + # Poison released CuVectors with sentinel values + _cuda_poison_fill!(@inbounds tp.vectors[i]) + # Invalidate N-way cache entries for released slots. 
+ # After poisoning, cached views point at poisoned data — clear them so + # re-acquire creates fresh views instead of returning stale poisoned ones. + base = (i - 1) * CACHE_WAYS + for k in 1:CACHE_WAYS + @inbounds tp.views[base + k] = nothing + @inbounds tp.view_dims[base + k] = nothing + end + end + return nothing +end + +# ============================================================================== +# Borrow Tracking: Call-site recording (Level 3) +# ============================================================================== +# +# Overrides the no-op AbstractArrayPool fallbacks. +# The macro injects pool._pending_callsite = "file:line\nexpr" before acquire calls. +# These functions flush that pending info into the borrow log. + +"""Record pending callsite for borrow tracking (compiles to no-op when S < 3).""" +@inline function AdaptiveArrayPools._set_pending_callsite!(pool::CuAdaptiveArrayPool{S}, msg::String) where {S} + S >= 3 && isempty(pool._pending_callsite) && (pool._pending_callsite = msg) + return nothing +end + +"""Flush pending callsite into borrow log (compiles to no-op when S < 3).""" +@inline function AdaptiveArrayPools._maybe_record_borrow!(pool::CuAdaptiveArrayPool{S}, tp::AbstractTypedPool) where {S} + S >= 3 && _cuda_record_borrow_from_pending!(pool, tp) + return nothing +end + +@noinline function _cuda_record_borrow_from_pending!(pool::CuAdaptiveArrayPool, tp::AbstractTypedPool) + callsite = pool._pending_callsite + isempty(callsite) && return nothing + log = pool._borrow_log + if log === nothing + log = IdDict{Any, String}() + pool._borrow_log = log + end + @inbounds log[tp.vectors[tp.n_active]] = callsite + pool._pending_callsite = "" # Clear for next acquire + return nothing +end + +@noinline function _cuda_lookup_borrow_callsite(pool::CuAdaptiveArrayPool, v)::Union{Nothing, String} + log = pool._borrow_log + log === nothing && return nothing + return get(log, v, nothing) +end + +# 
============================================================================== +# Escape Detection: _validate_pool_return for CuArrays (Level 2+) +# ============================================================================== +# +# CuArray views share the same device buffer, so device pointer overlap +# detection works correctly. pointer(::CuArray) returns CuPtr{T}. + +function AdaptiveArrayPools._validate_pool_return(val, pool::CuAdaptiveArrayPool{S}) where {S} + (S >= 2 || POOL_DEBUG[]) || return nothing + _validate_cuda_return(val, pool) + return nothing +end + +function _validate_cuda_return(val, pool::CuAdaptiveArrayPool) + # Note: Container recursion (Tuple, NamedTuple, Pair, Dict, Set, AbstractArray) + # is duplicated from CPU's _validate_pool_return dispatch chain (src/debug.jl). + # CPU uses multiple dispatch on pool::AdaptiveArrayPool for each container type, + # which doesn't cover CuAdaptiveArrayPool. We could add CuAdaptiveArrayPool methods + # for each container, but that creates 6+ method definitions vs. this single function. + # Trade-off: if a new container type is added to the CPU path, it must also be added here. + + # CuArray (CuVector, CuMatrix, etc.) + if val isa CuArray + _check_cuda_pointer_overlap(val, pool) + return + end + + # SubArray / ReshapedArray of CuArray — defensive code. + # Current CUDA.jl: view(CuVector, 1:n) returns CuArray via GPUArrays derive(), + # NOT SubArray. These branches guard against future CUDA.jl behavior changes + # or user-constructed SubArray{T,N,CuArray} / ReshapedArray wrappers. 
+ if val isa SubArray + p = parent(val) + if p isa CuArray + _check_cuda_pointer_overlap(p, pool, val) + end + return + end + + if val isa Base.ReshapedArray + p = parent(val) + if p isa CuArray + _check_cuda_pointer_overlap(p, pool, val) + elseif p isa SubArray + pp = parent(p) + if pp isa CuArray + _check_cuda_pointer_overlap(pp, pool, val) + end + end + return + end + + # Tuple + if val isa Tuple + for x in val + _validate_cuda_return(x, pool) + end + return + end + + # NamedTuple + if val isa NamedTuple + for x in values(val) + _validate_cuda_return(x, pool) + end + return + end + + # Pair + if val isa Pair + _validate_cuda_return(val.first, pool) + _validate_cuda_return(val.second, pool) + return + end + + # AbstractDict + if val isa AbstractDict + for p in val + _validate_cuda_return(p, pool) + end + return + end + + # AbstractSet + if val isa AbstractSet + for x in val + _validate_cuda_return(x, pool) + end + return + end + + # Array of CuArrays (element recursion for containers) + if val isa AbstractArray + ET = eltype(val) + if !(ET <: Number) && !(ET <: AbstractString) && ET !== Symbol && ET !== Char + for x in val + _validate_cuda_return(x, pool) + end + end + end + + return +end + +""" + _check_cuda_pointer_overlap(arr::CuArray, pool, original_val=arr) + +Check if a CuArray's device memory overlaps with any pool backing CuVector. +Throws `PoolRuntimeEscapeError` on overlap. +""" +function _check_cuda_pointer_overlap(arr::CuArray, pool::CuAdaptiveArrayPool, original_val = arr) + arr_ptr = UInt(pointer(arr)) + arr_bytes = length(arr) * sizeof(eltype(arr)) + arr_end = arr_ptr + arr_bytes + + return_site = let rs = pool._pending_return_site + isempty(rs) ? 
nothing : rs + end + + # Check fixed slots + AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + _check_tp_cuda_overlap(tp, arr_ptr, arr_end, pool, return_site, original_val) + end + + # Check others + for tp in values(pool.others) + _check_tp_cuda_overlap(tp, arr_ptr, arr_end, pool, return_site, original_val) + end + return +end + +@noinline function _check_tp_cuda_overlap( + tp::AbstractTypedPool, arr_ptr::UInt, arr_end::UInt, + pool::CuAdaptiveArrayPool, return_site, original_val + ) + for v in tp.vectors + v_ptr = UInt(pointer(v)) + v_bytes = length(v) * sizeof(eltype(v)) + v_end = v_ptr + v_bytes + if !(arr_end <= v_ptr || v_end <= arr_ptr) + callsite = _cuda_lookup_borrow_callsite(pool, v) + _throw_pool_escape_error(original_val, eltype(v), callsite, return_site) + end + end + return +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl index 54384a1a..62796ec9 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl @@ -3,7 +3,7 @@ # ============================================================================== # Enables @with_pool :cuda syntax for GPU memory pooling. -using AdaptiveArrayPools: _get_pool_for_backend +using AdaptiveArrayPools: _get_pool_for_backend, _dispatch_pool_scope # ============================================================================== # Backend Registration (Val dispatch - zero overhead) @@ -14,3 +14,23 @@ Register :cuda backend for `@with_pool :cuda` syntax. Uses Val dispatch for compile-time resolution and full inlining. 
""" @inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() + +# ============================================================================== +# Union Splitting for CuAdaptiveArrayPool{S} +# ============================================================================== +# +# The base _dispatch_pool_scope has an `else` fallback for non-CPU pools that +# passes pool_any without type narrowing. This override provides union splitting +# for CUDA pools, enabling compile-time S → dead-code elimination of safety branches. + +@inline function AdaptiveArrayPools._dispatch_pool_scope(f, pool_any::CuAdaptiveArrayPool) + if pool_any isa CuAdaptiveArrayPool{0} + return f(pool_any::CuAdaptiveArrayPool{0}) + elseif pool_any isa CuAdaptiveArrayPool{1} + return f(pool_any::CuAdaptiveArrayPool{1}) + elseif pool_any isa CuAdaptiveArrayPool{2} + return f(pool_any::CuAdaptiveArrayPool{2}) + else + return f(pool_any::CuAdaptiveArrayPool{3}) + end +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl index 46cd616c..80a28ee2 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/state.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl @@ -1,9 +1,10 @@ # ============================================================================== # State Management for CUDA Pools # ============================================================================== -# checkpoint!, rewind!, reset!, empty! implementations for CuAdaptiveArrayPool. +# checkpoint!, rewind!, reset!, empty! implementations for CuAdaptiveArrayPool{S}. # Note: _checkpoint_typed_pool! and _rewind_typed_pool! already work with # AbstractTypedPool, so they work for CuTypedPool automatically. +# S parameter is threaded through rewind paths for compile-time safety dispatch. 
using AdaptiveArrayPools: checkpoint!, rewind!, reset!, _checkpoint_typed_pool!, _rewind_typed_pool!, _has_bit, @@ -18,7 +19,7 @@ using AdaptiveArrayPools: checkpoint!, rewind!, reset!, Apply `f` to each fixed slot CuTypedPool. Zero allocation via compile-time unrolling. """ -@generated function AdaptiveArrayPools.foreach_fixed_slot(f::F, pool::CuAdaptiveArrayPool) where {F} +@generated function AdaptiveArrayPools.foreach_fixed_slot(f::F, pool::CuAdaptiveArrayPool{S}) where {F, S} exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in GPU_FIXED_SLOT_FIELDS] return quote Base.@_inline_meta @@ -61,7 +62,7 @@ end end # Type-specific checkpoint (multiple types) -@generated function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool, types::Type...) +@generated function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool{S}, types::Type...) where {S} seen = Set{Any}() unique_indices = Int[] for i in eachindex(types) @@ -85,7 +86,7 @@ end # rewind! for CuAdaptiveArrayPool # ============================================================================== -function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) +function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool{S}) where {S} cur_depth = pool._current_depth # Safety guard: at global scope (depth=1), delegate to reset! 
@@ -94,14 +95,14 @@ function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) return nothing end - # Fixed slots + # Fixed slots — pass S for compile-time safety dispatch AdaptiveArrayPools.foreach_fixed_slot(pool) do tp - _rewind_typed_pool!(tp, cur_depth) + _rewind_typed_pool!(tp, cur_depth, S) end # Others for tp in values(pool.others) - _rewind_typed_pool!(tp, cur_depth) + _rewind_typed_pool!(tp, cur_depth, S) end pop!(pool._touched_type_masks) @@ -112,12 +113,12 @@ function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) end # Type-specific rewind (single type) -@inline function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T} +@inline function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool{S}, ::Type{T}) where {S, T} if pool._current_depth == 1 reset!(AdaptiveArrayPools.get_typed_pool!(pool, T)) return nothing end - _rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth) + _rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth, S) pop!(pool._touched_type_masks) pop!(pool._touched_has_others) pool._current_depth -= 1 @@ -125,7 +126,7 @@ end end # Type-specific rewind (multiple types) -@generated function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, types::Type...) +@generated function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool{S}, types::Type...) 
where {S} seen = Set{Any}() unique_indices = Int[] for i in eachindex(types) @@ -134,7 +135,7 @@ end push!(unique_indices, i) end end - rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in reverse(unique_indices)] + rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth, S)) for i in reverse(unique_indices)] reset_exprs = [:(reset!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]))) for i in unique_indices] return quote if pool._current_depth == 1 @@ -178,21 +179,21 @@ end return nothing end -@inline function AdaptiveArrayPools._lazy_rewind!(pool::CuAdaptiveArrayPool) +@inline function AdaptiveArrayPools._lazy_rewind!(pool::CuAdaptiveArrayPool{S}) where {S} d = pool._current_depth mask = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK - _has_bit(mask, Float64) && _rewind_typed_pool!(pool.float64, d) - _has_bit(mask, Float32) && _rewind_typed_pool!(pool.float32, d) - _has_bit(mask, Int64) && _rewind_typed_pool!(pool.int64, d) - _has_bit(mask, Int32) && _rewind_typed_pool!(pool.int32, d) - _has_bit(mask, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d) - _has_bit(mask, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d) - _has_bit(mask, Bool) && _rewind_typed_pool!(pool.bool, d) + _has_bit(mask, Float64) && _rewind_typed_pool!(pool.float64, d, S) + _has_bit(mask, Float32) && _rewind_typed_pool!(pool.float32, d, S) + _has_bit(mask, Int64) && _rewind_typed_pool!(pool.int64, d, S) + _has_bit(mask, Int32) && _rewind_typed_pool!(pool.int32, d, S) + _has_bit(mask, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d, S) + _has_bit(mask, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d, S) + _has_bit(mask, Bool) && _rewind_typed_pool!(pool.bool, d, S) # Bit 7: Float16 (CUDA reassignment — _fixed_slot_bit(Float16)==0, must use explicit bit check) - mask & _cuda_float16_bit() != 0 && _rewind_typed_pool!(pool.float16, d) + mask 
& _cuda_float16_bit() != 0 && _rewind_typed_pool!(pool.float16, d, S) if @inbounds(pool._touched_has_others[d]) for tp in values(pool.others) - _rewind_typed_pool!(tp, d) + _rewind_typed_pool!(tp, d, S) end end pop!(pool._touched_type_masks) @@ -228,17 +229,17 @@ end # Uses direct field access with bit checks — foreach_fixed_slot is single-argument (no bit yield). # Bit 7: Float16 (CUDA-specific; lazy-checkpointed on first touch by _record_type_touch!). # has_others: genuine others types (UInt8, Int8, etc.) — eagerly checkpointed at scope entry. -@inline function AdaptiveArrayPools._typed_lazy_rewind!(pool::CuAdaptiveArrayPool, tracked_mask::UInt16) +@inline function AdaptiveArrayPools._typed_lazy_rewind!(pool::CuAdaptiveArrayPool{S}, tracked_mask::UInt16) where {S} d = pool._current_depth touched = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK combined = tracked_mask | touched - _has_bit(combined, Float64) && _rewind_typed_pool!(pool.float64, d) - _has_bit(combined, Float32) && _rewind_typed_pool!(pool.float32, d) - _has_bit(combined, Int64) && _rewind_typed_pool!(pool.int64, d) - _has_bit(combined, Int32) && _rewind_typed_pool!(pool.int32, d) - _has_bit(combined, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d) - _has_bit(combined, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d) - _has_bit(combined, Bool) && _rewind_typed_pool!(pool.bool, d) + _has_bit(combined, Float64) && _rewind_typed_pool!(pool.float64, d, S) + _has_bit(combined, Float32) && _rewind_typed_pool!(pool.float32, d, S) + _has_bit(combined, Int64) && _rewind_typed_pool!(pool.int64, d, S) + _has_bit(combined, Int32) && _rewind_typed_pool!(pool.int32, d, S) + _has_bit(combined, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d, S) + _has_bit(combined, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d, S) + _has_bit(combined, Bool) && _rewind_typed_pool!(pool.bool, d, S) # Float16: bit 7 is set by _record_type_touch! on first touch (lazy first-touch). 
# Also rewind when Float16 was a *tracked* type in the macro: _typed_lazy_checkpoint! # calls checkpoint!(pool, Float16) which pushes a checkpoint at depth d, but _acquire_impl! @@ -247,11 +248,11 @@ end # tracked_mask carries no bit for Float16 either. # Solution: check _checkpoint_depths to detect "Float16 was checkpointed at this depth". if combined & _cuda_float16_bit() != 0 || @inbounds(pool.float16._checkpoint_depths[end]) == d - _rewind_typed_pool!(pool.float16, d) + _rewind_typed_pool!(pool.float16, d, S) end if @inbounds(pool._touched_has_others[d]) for tp in values(pool.others) - _rewind_typed_pool!(tp, d) + _rewind_typed_pool!(tp, d, S) end end pop!(pool._touched_type_masks) @@ -282,6 +283,11 @@ function AdaptiveArrayPools.reset!(pool::CuAdaptiveArrayPool) empty!(pool._touched_has_others) push!(pool._touched_has_others, false) # Sentinel: no others + # Reset borrow tracking state + pool._pending_callsite = "" + pool._pending_return_site = "" + pool._borrow_log = nothing + return pool end @@ -341,5 +347,10 @@ function Base.empty!(pool::CuAdaptiveArrayPool) empty!(pool._touched_has_others) push!(pool._touched_has_others, false) # Sentinel: no others + # Reset borrow tracking state + pool._pending_callsite = "" + pool._pending_return_site = "" + pool._borrow_log = nothing + return pool end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl index 60da07f6..4ab507dd 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl @@ -2,11 +2,12 @@ # Task-Local CUDA Pool (Multi-Device Aware) # ============================================================================== # Each Task gets one pool per GPU device to prevent cross-device memory access. +# Pools are parameterized by safety level S (CuAdaptiveArrayPool{S}). 
const _CU_POOL_KEY = :ADAPTIVE_ARRAY_POOL_CUDA """ - get_task_local_cuda_pool() -> CuAdaptiveArrayPool + get_task_local_cuda_pool() -> CuAdaptiveArrayPool{S} Retrieves (or creates) the `CuAdaptiveArrayPool` for the current Task and current GPU device. @@ -18,6 +19,7 @@ a dictionary of pools (one per device) in task-local storage, ensuring that: ## Implementation Uses `Dict{Int, CuAdaptiveArrayPool}` in task-local storage, keyed by device ID. +Values are `CuAdaptiveArrayPool{S}` — use `_dispatch_pool_scope` for union splitting. """ @inline function AdaptiveArrayPools.get_task_local_cuda_pool() # 1. Get or create the pools dictionary @@ -33,7 +35,7 @@ Uses `Dict{Int, CuAdaptiveArrayPool}` in task-local storage, keyed by device ID. # 3. Get or create pool for this device pool = get(pools, dev_id, nothing) if pool === nothing - pool = CuAdaptiveArrayPool() # Constructor captures device_id + pool = CuAdaptiveArrayPool() # Constructor uses POOL_SAFETY_LV[] pools[dev_id] = pool end @@ -52,5 +54,34 @@ Useful for diagnostics or bulk operations across all devices. pools = Dict{Int, CuAdaptiveArrayPool}() task_local_storage(_CU_POOL_KEY, pools) end - return pools::Dict{Int, CuAdaptiveArrayPool} + return pools +end + +# ============================================================================== +# Safety Level Hook (called from set_safety_level! in base) +# ============================================================================== + +function AdaptiveArrayPools._set_cuda_safety_level_hook!(level::Int) + pools = get(task_local_storage(), _CU_POOL_KEY, nothing) + pools === nothing && return nothing + + # Check that no pool is inside an active scope + for (dev_id, old_pool) in pools + old = old_pool::CuAdaptiveArrayPool + depth = old._current_depth + depth != 1 && throw( + ArgumentError( + "set_safety_level! 
cannot be called inside an active @with_pool :cuda scope " * + "(device=$dev_id, depth=$depth)" + ) + ) + end + + # Replace all pools (collect keys to avoid mutating Dict during iteration) + for dev_id in collect(keys(pools)) + old = pools[dev_id]::CuAdaptiveArrayPool + pools[dev_id] = _make_cuda_pool(level, old) + end + + return nothing end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index 4e5dd9fc..90f40741 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -83,9 +83,16 @@ const GPU_FIXED_SLOT_FIELDS = ( # ============================================================================== """ - CuAdaptiveArrayPool <: AbstractArrayPool + CuAdaptiveArrayPool{S} <: AbstractArrayPool -Multi-type GPU memory pool. Task-local and device-specific. +Multi-type GPU memory pool, parameterized by safety level `S` (0–3). + +## Safety Levels (CUDA-specific) +- `S=0`: Zero overhead — all safety branches eliminated by dead-code elimination +- `S=1`: Guard — poisoning (NaN/sentinel fill on released vectors) + cache invalidation + (CUDA equivalent of CPU's resize! structural invalidation) +- `S=2`: Full — poisoning + escape detection (`_validate_pool_return`) +- `S=3`: Debug — full + borrow call-site registry + debug messages ## Device Safety Each pool is bound to a specific GPU device. Using a pool on the wrong device @@ -95,8 +102,9 @@ causes undefined behavior. The `device_id` field tracks ownership. 
- Fixed slots for common GPU types (Float32 priority, includes Float16) - `others`: IdDict fallback for rare types - `device_id`: The GPU device this pool belongs to +- Borrow tracking fields (required by macro-injected field access at all S levels) """ -mutable struct CuAdaptiveArrayPool <: AbstractArrayPool +mutable struct CuAdaptiveArrayPool{S} <: AbstractArrayPool # Fixed Slots (GPU-optimized order) float32::CuTypedPool{Float32} float64::CuTypedPool{Float64} @@ -117,11 +125,16 @@ mutable struct CuAdaptiveArrayPool <: AbstractArrayPool # Device tracking (safety) device_id::Int + + # Borrow tracking (required: macro injects pool._pending_callsite = "..." as raw AST) + _pending_callsite::String + _pending_return_site::String + _borrow_log::Union{Nothing, IdDict{Any, String}} end -function CuAdaptiveArrayPool() +function CuAdaptiveArrayPool{S}() where {S} dev = CUDA.device() - return CuAdaptiveArrayPool( + return CuAdaptiveArrayPool{S}( CuTypedPool{Float32}(), CuTypedPool{Float64}(), CuTypedPool{Float16}(), @@ -134,6 +147,78 @@ function CuAdaptiveArrayPool() 1, # _current_depth (1 = global scope) [UInt16(0)], # _touched_type_masks: sentinel (no bits set) [false], # _touched_has_others: sentinel (no others) - CUDA.deviceid(dev) # Use public API + CUDA.deviceid(dev), + "", # _pending_callsite + "", # _pending_return_site + nothing # _borrow_log: lazily created at S >= 3 + ) +end + +"""Create pool at the current `POOL_SAFETY_LV[]` level.""" +CuAdaptiveArrayPool() = _make_cuda_pool(AdaptiveArrayPools.POOL_SAFETY_LV[]) + +# ============================================================================== +# Safety Level Dispatch +# ============================================================================== + +""" + _safety_level(pool::CuAdaptiveArrayPool{S}) -> Int + +Return compile-time constant safety level for CUDA pools. 
+""" +@inline AdaptiveArrayPools._safety_level(::CuAdaptiveArrayPool{S}) where {S} = S + +""" + _make_cuda_pool(s::Int) -> CuAdaptiveArrayPool{s} + +Function barrier: converts runtime `Int` to concrete `CuAdaptiveArrayPool{S}`. +Levels outside 0-3 are clamped (≤0 → 0, ≥3 → 3). +""" +@noinline function _make_cuda_pool(s::Int) + s <= 0 && return CuAdaptiveArrayPool{0}() + s == 1 && return CuAdaptiveArrayPool{1}() + s == 2 && return CuAdaptiveArrayPool{2}() + return CuAdaptiveArrayPool{3}() +end + +""" + _make_cuda_pool(s::Int, old::CuAdaptiveArrayPool) -> CuAdaptiveArrayPool{s} + +Create a new pool at safety level `s`, transferring cached arrays and scope state +from `old`. Only reference copies — no memory allocation for underlying GPU buffers. + +Transferred: all CuTypedPool slots, `others`, depth & touch tracking, device_id. +Reset: `_pending_callsite/return_site` (transient macro state), + `_borrow_log` (created fresh when `s >= 3`). +""" +@noinline function _make_cuda_pool(s::Int, old::CuAdaptiveArrayPool) + s <= 0 && return _transfer_cuda_pool(Val(0), old) + s == 1 && return _transfer_cuda_pool(Val(1), old) + s == 2 && return _transfer_cuda_pool(Val(2), old) + return _transfer_cuda_pool(Val(3), old) +end + +"""Transfer cached arrays and scope state from `old` pool into a new `CuAdaptiveArrayPool{V}`.""" +function _transfer_cuda_pool(::Val{V}, old::CuAdaptiveArrayPool) where {V} + return CuAdaptiveArrayPool{V}( + old.float32, old.float64, old.float16, + old.int32, old.int64, + old.complexf32, old.complexf64, old.bool, + old.others, + old._current_depth, + old._touched_type_masks, + old._touched_has_others, + old.device_id, + "", # _pending_callsite: reset + "", # _pending_return_site: reset + V >= 3 ? 
IdDict{Any, String}() : nothing # _borrow_log ) end + +"""Human-readable safety level label.""" +function _cuda_safety_label(s::Int) + s <= 0 && return "off" + s == 1 && return "guard" + s == 2 && return "full" + return "debug" +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/utils.jl b/ext/AdaptiveArrayPoolsCUDAExt/utils.jl index c8c22556..18d40512 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/utils.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/utils.jl @@ -57,11 +57,14 @@ end Print statistics for a CUDA adaptive array pool. """ -function AdaptiveArrayPools.pool_stats(pool::CuAdaptiveArrayPool; io::IO = stdout) - # Header with device info +function AdaptiveArrayPools.pool_stats(pool::CuAdaptiveArrayPool{S}; io::IO = stdout) where {S} + # Header with device info and safety level printstyled(io, "CuAdaptiveArrayPool", bold = true, color = :green) + printstyled(io, "{$S}", color = :yellow) printstyled(io, " (device ", color = :dark_gray) printstyled(io, pool.device_id, color = :blue) + printstyled(io, ", safety=", color = :dark_gray) + printstyled(io, _cuda_safety_label(S), color = :yellow) printstyled(io, ")\n", color = :dark_gray) has_content = false @@ -112,7 +115,7 @@ end # ============================================================================== # Compact one-line show -function Base.show(io::IO, pool::CuAdaptiveArrayPool) +function Base.show(io::IO, pool::CuAdaptiveArrayPool{S}) where {S} n_types = Ref(0) total_vectors = Ref(0) total_active = Ref(0) @@ -131,7 +134,7 @@ function Base.show(io::IO, pool::CuAdaptiveArrayPool) total_active[] += tp.n_active end - return print(io, "CuAdaptiveArrayPool(device=$(pool.device_id), types=$(n_types[]), slots=$(total_vectors[]), active=$(total_active[]))") + return print(io, "CuAdaptiveArrayPool{$S}(safety=$(_cuda_safety_label(S)), device=$(pool.device_id), types=$(n_types[]), slots=$(total_vectors[]), active=$(total_active[]))") end # Multi-line show diff --git a/src/debug.jl b/src/debug.jl index 256a1ac6..83eed243 100644 --- 
a/src/debug.jl +++ b/src/debug.jl @@ -241,7 +241,7 @@ function _validate_pool_return(val::AbstractSet, pool::AdaptiveArrayPool) end _validate_pool_return(val, ::DisabledPool) = nothing -# No-op fallback for non-CPU pools (e.g. CuAdaptiveArrayPool) that lack borrow tracking fields +# No-op fallback for pool types without specific validation (overridden by CUDA extension) _validate_pool_return(val, ::AbstractArrayPool) = nothing # ============================================================================== diff --git a/src/task_local_pool.jl b/src/task_local_pool.jl index 06b8a9ed..d316a703 100644 --- a/src/task_local_pool.jl +++ b/src/task_local_pool.jl @@ -144,22 +144,18 @@ end """ set_safety_level!(level::Int) -> AdaptiveArrayPool -Replace the task-local pool with a new `AdaptiveArrayPool{level}`, -preserving all cached arrays and scope state from the old pool. +Replace the task-local CPU pool (and CUDA pools if CUDA.jl is loaded) +with new pools at the given safety level, preserving cached arrays +and scope state (zero-copy transfer). -Cached TypedPool/BitTypedPool slots, the `others` IdDict, depth tracking, -and touch masks are transferred by reference (zero copy). -Transient borrow-tracking state (`_pending_callsite`, `_borrow_log`) is reset. - -One-time JIT cost for new `S` specialization. -Also updates `POOL_SAFETY_LV[]` so that `AdaptiveArrayPool()` creates pools -at the new level. +Also updates `POOL_SAFETY_LV[]` so that future `AdaptiveArrayPool()` / +`CuAdaptiveArrayPool()` constructors use the new level. ## Example ```julia -set_safety_level!(2) # Enable full safety (escape detection + poisoning) +set_safety_level!(2) # Enable full safety on CPU + all GPU devices # ... run suspicious code ... 
-set_safety_level!(0) # Back to zero overhead — cached arrays still available +set_safety_level!(0) # Back to zero overhead everywhere ``` See also: [`_safety_level`], [`POOL_SAFETY_LV`] @@ -178,9 +174,14 @@ function set_safety_level!(level::Int) POOL_SAFETY_LV[] = level new_pool = old_pool === nothing ? _make_pool(level) : _make_pool(level, old_pool::AdaptiveArrayPool) task_local_storage(_POOL_KEY, new_pool) + # Update CUDA pools if extension is loaded (no-op otherwise) + _set_cuda_safety_level_hook!(level) return new_pool end +# Hook for CUDA extension to override. No-op when CUDA is not loaded. +_set_cuda_safety_level_hook!(::Int) = nothing + # ============================================================================== # CUDA Pool Stubs (overridden by extension when CUDA is loaded) # ============================================================================== diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index cc2fb158..2c016a50 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -41,4 +41,5 @@ else include("test_display.jl") include("test_convenience.jl") include("test_disabled_pool.jl") + include("test_cuda_safety.jl") end diff --git a/test/cuda/test_cuda_safety.jl b/test/cuda/test_cuda_safety.jl new file mode 100644 index 00000000..a5fbc5d4 --- /dev/null +++ b/test/cuda/test_cuda_safety.jl @@ -0,0 +1,659 @@ +import AdaptiveArrayPools: PoolRuntimeEscapeError, PoolEscapeError, _safety_level, POOL_DEBUG + +const _make_cuda_pool = ext._make_cuda_pool + +# Opaque identity — defeats compile-time escape analysis +_cuda_test_leak(x) = x + +@testset "CUDA Safety Dispatch (CuAdaptiveArrayPool{S})" begin + + # ============================================================================== + # Type parameterization basics + # ============================================================================== + + @testset "CuAdaptiveArrayPool{S} construction and _safety_level" begin + p0 = _make_cuda_pool(0) + p1 = _make_cuda_pool(1) + p2 = 
_make_cuda_pool(2) + p3 = _make_cuda_pool(3) + + @test p0 isa CuAdaptiveArrayPool{0} + @test p1 isa CuAdaptiveArrayPool{1} + @test p2 isa CuAdaptiveArrayPool{2} + @test p3 isa CuAdaptiveArrayPool{3} + + @test _safety_level(p0) == 0 + @test _safety_level(p1) == 1 + @test _safety_level(p2) == 2 + @test _safety_level(p3) == 3 + + # Borrow fields exist at all levels (required by macro-injected field access) + @test hasfield(typeof(p0), :_pending_callsite) + @test hasfield(typeof(p0), :_pending_return_site) + @test hasfield(typeof(p0), :_borrow_log) + end + + # ============================================================================== + # Level 0: No poisoning, no validation + # ============================================================================== + + @testset "Level 0: no poisoning on rewind" begin + pool = _make_cuda_pool(0) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # With safety off, backing vector still has valid data + @test length(pool.float32.vectors[1]) >= 10 + # Data should still be there (no poisoning) + checkpoint!(pool) + v2 = acquire!(pool, Float32, 10) + @test all(x -> x == 42.0f0, Array(v2)) + rewind!(pool) + end + + # ============================================================================== + # Level 1: Poisoning (CUDA equivalent of CPU's resize! invalidation) + # ============================================================================== + + @testset "Level 1: Float32 poisoned with NaN on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # Backing vector should be poisoned with NaN (NOT freed via resize!) 
+ @test length(pool.float32.vectors[1]) >= 10 + cpu_data = Array(pool.float32.vectors[1]) + @test all(isnan, cpu_data[1:10]) + + # Re-acquire: should see poisoned data + checkpoint!(pool) + v2 = acquire!(pool, Float32, 10) + @test all(isnan, Array(v2)) + rewind!(pool) + end + + @testset "Level 1: Int32 poisoned with typemax on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Int32, 8) + CUDA.fill!(v, Int32(42)) + rewind!(pool) + + cpu_data = Array(pool.int32.vectors[1]) + @test all(==(typemax(Int32)), cpu_data[1:8]) + end + + @testset "Level 1: ComplexF32 poisoned with NaN on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, ComplexF32, 8) + CUDA.fill!(v, ComplexF32(1.0f0 + 2.0f0im)) + rewind!(pool) + + cpu_data = Array(pool.complexf32.vectors[1]) + @test all(z -> isnan(real(z)) && isnan(imag(z)), cpu_data[1:8]) + end + + @testset "Level 1: Bool poisoned with true on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Bool, 16) + CUDA.fill!(v, false) + rewind!(pool) + + cpu_data = Array(pool.bool.vectors[1]) + @test all(==(true), cpu_data[1:16]) + end + + @testset "Level 1: Float16 poisoned with NaN on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float16, 10) + CUDA.fill!(v, Float16(42.0)) + rewind!(pool) + + cpu_data = Array(pool.float16.vectors[1]) + @test all(isnan, cpu_data[1:10]) + end + + @testset "Level 1: N-way cache invalidated on poisoned rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 1.0f0) + rewind!(pool) + + # Cached views should be cleared (nothing) after poisoning + base = 0 * ext.CACHE_WAYS + for k in 1:ext.CACHE_WAYS + @test pool.float32.views[base + k] === nothing + end + end + + @testset "Level 1: no escape detection" begin + # Level 1 should NOT throw on escape (that's Level 2+) + pool = _make_cuda_pool(1) + result = begin + checkpoint!(pool) + v = 
acquire!(pool, Float32, 10) + rewind!(pool) + v # "escaping" — should not throw at Level 1 + end + @test result isa CuArray + end + + # ============================================================================== + # Level 0: Verify no poisoning + # ============================================================================== + + @testset "Level 0: no poisoning (verify data survives rewind)" begin + pool = _make_cuda_pool(0) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # Data should NOT be poisoned at Level 0 + cpu_data = Array(pool.float32.vectors[1]) + @test all(x -> x == 42.0f0, cpu_data[1:10]) + end + + # ============================================================================== + # Level 2: Escape detection + # ============================================================================== + + @testset "Level 2: escape detection catches CuArray leak" begin + pool = _make_cuda_pool(2) + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + # Simulate what _validate_pool_return does + AdaptiveArrayPools._validate_pool_return(_cuda_test_leak(v), pool) + finally + rewind!(pool) + end + end + end + + @testset "Level 2: safe scalar return does not throw" begin + pool = _make_cuda_pool(2) + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 3.0f0) + result = sum(Array(v)) # scalar — safe + AdaptiveArrayPools._validate_pool_return(result, pool) + @test result == 30.0f0 + finally + rewind!(pool) + end + end + + @testset "Level 2: escape detection with Tuple containing CuArray" begin + pool = _make_cuda_pool(2) + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + val = (42, _cuda_test_leak(v)) + AdaptiveArrayPools._validate_pool_return(val, pool) + finally + rewind!(pool) + end + end + end + + @testset "Level 2: escape detection with Dict containing CuArray" begin + pool = 
_make_cuda_pool(2) + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + val = Dict(:data => _cuda_test_leak(v)) + AdaptiveArrayPools._validate_pool_return(val, pool) + finally + rewind!(pool) + end + end + end + + @testset "Level 0 and 1: no escape detection" begin + for lv in (0, 1) + pool = _make_cuda_pool(lv) + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + # Should NOT throw — escape detection requires Level 2+ + AdaptiveArrayPools._validate_pool_return(_cuda_test_leak(v), pool) + finally + rewind!(pool) + end + end + end + + # ============================================================================== + # Level 3: Borrow tracking + # ============================================================================== + + @testset "Level 3: borrow fields functional" begin + pool = _make_cuda_pool(3) + @test pool._pending_callsite == "" + @test pool._pending_return_site == "" + @test pool._borrow_log === nothing # lazily created + end + + @testset "Level 3: _set_pending_callsite! works" begin + pool = _make_cuda_pool(3) + AdaptiveArrayPools._set_pending_callsite!(pool, "test.jl:42\nacquire!(pool, Float32, 10)") + @test pool._pending_callsite == "test.jl:42\nacquire!(pool, Float32, 10)" + + # At Level 0, should be no-op + pool0 = _make_cuda_pool(0) + AdaptiveArrayPools._set_pending_callsite!(pool0, "should not be set") + @test pool0._pending_callsite == "" + end + + @testset "Level 3: _maybe_record_borrow! 
records callsite" begin + pool = _make_cuda_pool(3) + checkpoint!(pool) + tp = get_typed_pool!(pool, Float32) + + # Set pending callsite, then acquire to increment n_active + AdaptiveArrayPools._set_pending_callsite!(pool, "test.jl:99\nacquire!(pool, Float32, 5)") + acquire!(pool, Float32, 5) + + # The borrow log should now have an entry + @test pool._borrow_log !== nothing + @test length(pool._borrow_log) >= 1 + + rewind!(pool) + end + + # ============================================================================== + # set_safety_level! — all-device replacement + # ============================================================================== + + @testset "set_safety_level! replaces pool with state preservation" begin + # Get current pool (creates one at default safety level) + pool = get_task_local_cuda_pool() + reset!(pool) + + # Populate with some data + checkpoint!(pool) + v = acquire!(pool, Float32, 100) + CUDA.fill!(v, 1.0f0) + rewind!(pool) + + # Change safety level + set_safety_level!(2) + new_pool = get_task_local_cuda_pool() + + @test new_pool isa CuAdaptiveArrayPool{2} + @test _safety_level(new_pool) == 2 + # Cached vectors should be preserved (same object reference) + @test new_pool.float32.vectors[1] === pool.float32.vectors[1] + + # Restore + set_safety_level!(0) + @test get_task_local_cuda_pool() isa CuAdaptiveArrayPool{0} + end + + @testset "set_safety_level! 
rejects inside active scope" begin + pool = get_task_local_cuda_pool() + checkpoint!(pool) + try + @test_throws ArgumentError set_safety_level!(2) + finally + rewind!(pool) + end + end + + # ============================================================================== + # Nested scopes: inner poisoned, outer valid + # ============================================================================== + + @testset "Nested scopes: inner poisoned, outer still valid" begin + pool = _make_cuda_pool(1) + + checkpoint!(pool) + v_outer = acquire!(pool, Float32, 10) + CUDA.fill!(v_outer, 1.0f0) + + # Inner scope + checkpoint!(pool) + v_inner = acquire!(pool, Float32, 20) + CUDA.fill!(v_inner, 2.0f0) + rewind!(pool) + + # Inner should be poisoned (slot 2 released) + cpu_inner = Array(pool.float32.vectors[2]) + @test all(isnan, cpu_inner[1:20]) + + # Outer should still be valid (slot 1 not released) + cpu_outer = Array(v_outer) + @test all(x -> x == 1.0f0, cpu_outer) + + rewind!(pool) + # Now outer is also poisoned + cpu_outer_after = Array(pool.float32.vectors[1]) + @test all(isnan, cpu_outer_after[1:10]) + end + + # ============================================================================== + # reset! with safety + # ============================================================================== + + @testset "reset! 
clears borrow tracking state" begin + pool = _make_cuda_pool(3) + pool._pending_callsite = "test" + pool._pending_return_site = "test" + pool._borrow_log = IdDict{Any, String}() + + reset!(pool) + + @test pool._pending_callsite == "" + @test pool._pending_return_site == "" + @test pool._borrow_log === nothing + end + + # ============================================================================== + # Display includes {S} and safety label + # ============================================================================== + + @testset "show includes {S} and safety label" begin + pool = _make_cuda_pool(2) + s = sprint(show, pool) + @test occursin("{2}", s) + @test occursin("safety=full", s) + + pool0 = _make_cuda_pool(0) + s0 = sprint(show, pool0) + @test occursin("{0}", s0) + @test occursin("safety=off", s0) + end + + # ============================================================================== + # POOL_DEBUG backward compat with CUDA + # ============================================================================== + + @testset "POOL_DEBUG backward compat triggers CUDA escape detection" begin + old_debug = POOL_DEBUG[] + + POOL_DEBUG[] = true + pool = _make_cuda_pool(0) # Safety off, but POOL_DEBUG overrides + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + AdaptiveArrayPools._validate_pool_return(_cuda_test_leak(v), pool) + finally + rewind!(pool) + end + end + + POOL_DEBUG[] = old_debug + end + + # ============================================================================== + # Fallback types (pool.others) poisoning + # ============================================================================== + + @testset "Fallback type (UInt8) poisoned on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, UInt8, 16) + CUDA.fill!(v, UInt8(42)) + rewind!(pool) + + tp = pool.others[UInt8] + cpu_data = Array(tp.vectors[1]) + @test all(==(typemax(UInt8)), cpu_data[1:16]) + end + + # 
============================================================================== + # @with_pool :cuda integration with safety + # ============================================================================== + + @testset "@with_pool :cuda with escape detection" begin + old_debug = POOL_DEBUG[] + POOL_DEBUG[] = true # Use POOL_DEBUG to trigger on any safety level + + @test_throws PoolRuntimeEscapeError @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + + POOL_DEBUG[] = old_debug + end + + @testset "@with_pool :cuda safe return" begin + old_debug = POOL_DEBUG[] + POOL_DEBUG[] = true + + result = @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 3.0f0) + sum(Array(v)) # scalar return — safe + end + @test result == 30.0f0 + + POOL_DEBUG[] = old_debug + end + + # ============================================================================== + # Compile-time escape detection (@with_pool :cuda) + # ============================================================================== + + @testset "Compile-time: direct CuArray escape caught at macro expansion" begin + @test_throws PoolEscapeError @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + v # direct escape in tail position + end + end + + @testset "Compile-time: safe scalar return passes" begin + # Should NOT throw at macro expansion time + ex = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + sum(Array(v)) # scalar — safe + end + @test ex isa Expr + end + + @testset "Compile-time: zeros!/ones! 
escape caught" begin + @test_throws PoolEscapeError @macroexpand @with_pool :cuda pool begin + v = zeros!(pool, Float32, 10) + v + end + end + + # ============================================================================== + # @with_pool :cuda at native Level 2 (no POOL_DEBUG hack) + # ============================================================================== + + @testset "@with_pool :cuda Level 2 escape detection (native S=2)" begin + set_safety_level!(2) + + @test_throws PoolRuntimeEscapeError @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + + set_safety_level!(0) + end + + @testset "@with_pool :cuda Level 2 safe return (native S=2)" begin + set_safety_level!(2) + + result = @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 5.0f0) + sum(Array(v)) + end + @test result == 50.0f0 + + set_safety_level!(0) + end + + @testset "@with_pool :cuda Level 1 no escape detection (native S=1)" begin + set_safety_level!(1) + + # Level 1 should NOT trigger escape detection + result = @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + @test result isa CuArray + + set_safety_level!(0) + end + + # ============================================================================== + # Level 3 borrow tracking via macro path + # ============================================================================== + + @testset "@with_pool :cuda Level 3 escape error includes callsite" begin + set_safety_level!(3) + + err = try + @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + nothing + catch e + e + end + + @test err isa PoolRuntimeEscapeError + @test err.callsite !== nothing + @test contains(err.callsite, ":") # "file:line" format + + set_safety_level!(0) + end + + @testset "@with_pool :cuda Level 3 callsite includes expression text" begin + set_safety_level!(3) + + err = try + @with_pool :cuda pool begin + v = zeros!(pool, Float32, 10) + 
_cuda_test_leak(v) + end + nothing + catch e + e + end + + @test err isa PoolRuntimeEscapeError + @test err.callsite !== nothing + @test contains(err.callsite, "\n") + @test contains(err.callsite, "zeros!(pool, Float32, 10)") + + set_safety_level!(0) + end + + @testset "LV<3 does not create borrow log on CUDA" begin + for lv in (0, 1, 2) + pool = _make_cuda_pool(lv) + checkpoint!(pool) + _ = acquire!(pool, Float32, 10) + @test pool._borrow_log === nothing + rewind!(pool) + end + end + + @testset "LV=3 creates borrow log on CUDA acquire" begin + pool = _make_cuda_pool(3) + checkpoint!(pool) + _ = acquire!(pool, Float32, 10) + @test pool._borrow_log !== nothing + @test pool._borrow_log isa IdDict + rewind!(pool) + end + + # ============================================================================== + # Error message content (showerror) + # ============================================================================== + + @testset "showerror: CuArray escape error message format" begin + # LV≥2 without callsite → "Tip: set LV=3" + err = PoolRuntimeEscapeError("CuArray{Float32, 1}", "Float32", nothing, nothing) + io = IOBuffer() + showerror(io, err) + msg = String(take!(io)) + + @test contains(msg, "PoolEscapeError") + @test contains(msg, "CuArray{Float32, 1}") + @test contains(msg, "Float32") + @test contains(msg, "POOL_SAFETY_LV ≥ 2") + @test contains(msg, "Tip:") + @test contains(msg, "POOL_SAFETY_LV[] = 3") + end + + @testset "showerror: CuArray with callsite (LV≥3)" begin + err = PoolRuntimeEscapeError( + "CuArray{Float32, 1}", "Float32", + "test_cuda.jl:42\nacquire!(pool, Float32, 10)", nothing + ) + io = IOBuffer() + showerror(io, err) + msg = String(take!(io)) + + @test contains(msg, "acquired at") + @test contains(msg, "test_cuda.jl:42") + @test contains(msg, "acquire!(pool, Float32, 10)") + @test contains(msg, "POOL_SAFETY_LV ≥ 3") + @test !contains(msg, "Tip:") # No tip when callsite is present + end + + # 
============================================================================== + # Function form: @with_pool :cuda pool function ... + # ============================================================================== + + @testset "Function form: escape detection with explicit return" begin + set_safety_level!(2) + + @with_pool :cuda pool function _cuda_test_return_escape() + v = acquire!(pool, Float32, 10) + return _cuda_test_leak(v) + end + + @test_throws PoolRuntimeEscapeError _cuda_test_return_escape() + + set_safety_level!(0) + end + + @testset "Function form: safe scalar return passes" begin + set_safety_level!(2) + + @with_pool :cuda pool function _cuda_test_safe_return() + v = acquire!(pool, Float32, 5) + CUDA.fill!(v, 4.0f0) + return sum(Array(v)) + end + + @test _cuda_test_safe_return() == 20.0f0 + + set_safety_level!(0) + end + + @testset "Function form: bare return (nothing) passes" begin + set_safety_level!(2) + + @with_pool :cuda pool function _cuda_test_bare_return() + _ = acquire!(pool, Float32, 10) + return + end + + @test _cuda_test_bare_return() === nothing + + set_safety_level!(0) + end + +end # CUDA Safety Dispatch