diff --git a/docs/make.jl b/docs/make.jl index feb85de7..279ca48d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -38,6 +38,34 @@ const README_PATH_MAPPINGS = [ (r"\(docs/safety\.md(#[^)]+)?\)", s"(basics/safety-rules.md\1)"), ] +""" +Inject Google Search Console verification meta tag into generated HTML files. +This is enabled only when `ENV["GOOGLE_SITE_VERIFICATION"]` is set. +""" +function inject_google_site_verification!(build_dir::String) + token = strip(get(ENV, "GOOGLE_SITE_VERIFICATION", "")) + isempty(token) && return + + safe_token = replace(token, '"' => "&quot;") + meta_tag = "<meta name=\"google-site-verification\" content=\"$(safe_token)\" />" + injected = 0 + + for (root, _, files) in walkdir(build_dir) + for file in files + endswith(file, ".html") || continue + path = joinpath(root, file) + html = read(path, String) + occursin("google-site-verification", html) && continue + occursin("</head>", html) || continue + + write_if_changed(path, replace(html, "</head>" => "$(meta_tag)\n</head>"; count = 1)) + injected += 1 + end + end + + return @info "Injected google-site-verification meta tag" files = injected build_dir = build_dir +end + """ Rewrite relative paths in README.md for Documenter structure. @@ -74,9 +102,15 @@ makedocs( sitename = "AdaptiveArrayPools.jl", authors = "Min-Gu Yoo", modules = [AdaptiveArrayPools], + # servedocs() sets root to docs/ which conflicts with project-root remotes. + # Enable GitHub source links only in CI where makedocs root matches git root. + remotes = get(ENV, "CI", nothing) == "true" ? 
+ Dict(dirname(@__DIR__) => (Documenter.Remotes.GitHub("ProjectTorreyPines", "AdaptiveArrayPools.jl"), "master")) : + nothing, format = Documenter.HTML( prettyurls = get(ENV, "CI", nothing) == "true", canonical = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl", + edit_link = :commit, assets = String[], ), pages = [ @@ -92,6 +126,7 @@ makedocs( "Multi-threading" => "features/multi-threading.md", ], "Features" => [ + "Pool Safety" => "features/safety.md", "`@maybe_with_pool`" => "features/maybe-with-pool.md", "Bit Arrays" => "features/bit-arrays.md", "CUDA Support" => "features/cuda-support.md", @@ -112,6 +147,8 @@ makedocs( warnonly = [:cross_references, :missing_docs], ) +inject_google_site_verification!(joinpath(@__DIR__, "build")) + deploydocs( repo = "github.com/ProjectTorreyPines/AdaptiveArrayPools.jl.git", devbranch = "master", diff --git a/docs/src/features/safety.md b/docs/src/features/safety.md new file mode 100644 index 00000000..72b387b3 --- /dev/null +++ b/docs/src/features/safety.md @@ -0,0 +1,131 @@ +# Pool Safety + +AdaptiveArrayPools catches pool-escape bugs at **two levels**: compile-time (macro analysis) and runtime (configurable safety levels). + +## Compile-Time Detection + +The `@with_pool` macro statically analyzes your code and **rejects** any expression that would return a pool-backed array. This catches the most common mistakes at zero runtime cost. + +```julia +# Direct array escape — caught at macro expansion time +@with_pool pool begin + v = acquire!(pool, Float64, 100) + v # ← ERROR: v escapes the pool scope +end +``` + +This would throw an error message as follows: +``` +ERROR: LoadError: PoolEscapeError (compile-time) + + The following variable escapes the @with_pool scope: + + v ← pool-acquired view + + Declarations: + [1] v = acquire!(pool, Float64, 100) [myfile.jl:2] + + Escaping return: + [1] v [myfile.jl:3] + + Fix: Use collect(v) to return owned copies. 
+ Or use a regular Julia array (zeros()/Array{T}()) if it must outlive the pool scope. + +in expression starting at myfile.jl:1 +``` + +The analyzer tracks aliases, containers, and convenience wrappers: + +```julia +# All of these are caught at compile time: +@with_pool pool begin + v = zeros!(pool, Float64, 10) + w = v # alias of pool variable + t = (1, v) # tuple wrapping pool array + w # ← ERROR +end + +@with_pool pool function bad() + A = acquire!(pool, Float64, 3, 3) + return A # ← ERROR (explicit return) +end +``` + +Safe patterns pass without error: + +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + sum(v) # ✅ scalar result +end + +@with_pool pool begin + v = acquire!(pool, Float64, 100) + collect(v) # ✅ owned copy +end +``` + +## Runtime Safety Levels + +For bugs the compiler can't catch (e.g., values hidden behind opaque function calls), runtime safety provides configurable protection via the type parameter `S` in `AdaptiveArrayPool{S}`. + +### Level Overview + +| Level | Name | CPU | CUDA | Overhead | +|-------|------|-----|------|----------| +| **0** | off | No-op (all branches dead-code-eliminated) | Same | Zero | +| **1** | guard | `resize!(v,0)` + `setfield!` invalidation | NaN/sentinel poisoning + cache clear | ~5ns/slot | +| **2** | full | Level 1 + data poisoning + escape detection at scope exit | Level 1 + device-pointer overlap check | Moderate | +| **3** | debug | Level 2 + acquire call-site tracking | Same | Moderate+ | + +### Why CPU and CUDA Differ at Level 1 + +Both achieve the same goal — **make stale references fail loudly** — but use different mechanisms: + +| | CPU | CUDA | +|---|-----|------| +| **Strategy** | Structural invalidation | Data poisoning | +| **Mechanism** | `resize!(v, 0)` shrinks backing vector to length 0; `setfield!(:size, (0,))` zeroes the array dimensions | `CUDA.fill!(v, NaN)` / `typemax` / `true` fills backing CuVector with sentinel values | +| **Stale access result** | `BoundsError` (array has 
length 0) | Reads `NaN` or `typemax` (obviously wrong data) | +| **Why not the other way?** | CPU `resize!` is cheap (~0 cost) | CUDA `resize!` calls `CUDA.Mem.free()` — destroys the pooled VRAM allocation | +| **Cache invalidation** | View length/dims zeroed | N-way view cache entries cleared to `nothing` | + +### Setting the Level + +```julia +using AdaptiveArrayPools + +# Enable full safety on CPU + all GPU devices (preserves cached arrays, zero-copy) +set_safety_level!(2) + +# Back to zero overhead everywhere +set_safety_level!(0) +``` + +The pool type parameter `S` is a compile-time constant. At `S=0`, the JIT eliminates all safety branches via dead-code elimination — true zero overhead with no `Ref` reads or conditional branches. + +### Data Poisoning (Level 2+, CPU) + +At Level 1, CPU relies on **structural invalidation** (`resize!` + `setfield!`) which makes stale views throw `BoundsError`. At Level 2+, CPU additionally **poisons** the backing vector data with sentinel values (`NaN`, `typemax`, all-`true` for `BitVector`) *before* structural invalidation. This catches stale access through `unsafe_acquire!` wrappers on Julia 1.10 where `setfield!` on Array is unavailable. + +CUDA already poisons at Level 1 (its primary invalidation strategy), so no additional poisoning step is needed at Level 2. + +### Escape Detection (Level 2+) + +At every `@with_pool` scope exit, the return value is inspected for overlap with pool-backed memory. Recursively checks `Tuple`, `NamedTuple`, `Dict`, `Pair`, `Set`, and `AbstractArray` elements. + +Level 3 additionally records each `acquire!` call-site, so the error message pinpoints the exact source line and expression that allocated the escaping array. + +### Legacy: `POOL_DEBUG` + +`POOL_DEBUG[] = true` triggers Level 2 escape detection regardless of `S`. For new code, prefer `set_safety_level!(2)`. 
+ +## Recommended Workflow + +```julia +# Development / Testing: catch bugs early +set_safety_level!(2) # or 3 for call-site info in error messages + +# Production: zero overhead +set_safety_level!(0) # all safety branches eliminated by the compiler +``` diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index bef3f3a3..8abb3672 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -40,6 +40,9 @@ include("task_local_pool.jl") # State management (checkpoint!, rewind!, reset!, empty!) include("state.jl") +# Safety: poisoning, escape detection, borrow tracking +include("debug.jl") + # Display & statistics (pool_stats, show) include("utils.jl") diff --git a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl new file mode 100644 index 00000000..2ac371f8 --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl @@ -0,0 +1,256 @@ +# ============================================================================== +# CUDA Safety: Poisoning, Escape Detection, Borrow Tracking +# ============================================================================== +# CUDA-specific safety implementations for CuAdaptiveArrayPool{S}. +# +# Safety levels on CUDA differ from CPU: +# - Level 0: Zero overhead (all branches dead-code-eliminated) +# - Level 1: Poisoning (NaN/sentinel fill) + N-way cache invalidation +# (CUDA equivalent of CPU's resize!/setfield! structural invalidation) +# - Level 2: Poisoning + escape detection (_validate_pool_return for CuArrays) +# - Level 3: Full + borrow call-site registry + debug messages +# +# Key difference: CPU uses resize!(v, 0) at Level 1 to invalidate stale SubArrays. +# On CUDA, resize!(CuVector, 0) frees GPU memory, so we use poisoning instead. 
+ +using AdaptiveArrayPools: _safety_level, _validate_pool_return, + _set_pending_callsite!, _maybe_record_borrow!, + _invalidate_released_slots!, + _throw_pool_escape_error, + POOL_DEBUG, POOL_SAFETY_LV, + PoolRuntimeEscapeError + +# ============================================================================== +# Poisoning: Fill released CuVectors with sentinel values (Level 1+) +# ============================================================================== + +_cuda_poison_value(::Type{T}) where {T <: AbstractFloat} = T(NaN) +_cuda_poison_value(::Type{T}) where {T <: Integer} = typemax(T) +_cuda_poison_value(::Type{Complex{T}}) where {T} = Complex{T}(_cuda_poison_value(T), _cuda_poison_value(T)) +_cuda_poison_value(::Type{Bool}) = true +_cuda_poison_value(::Type{T}) where {T} = zero(T) # generic fallback + +""" + _cuda_poison_fill!(v::CuVector{T}) + +Fill a CuVector with a detectable sentinel value (NaN for floats, typemax for ints). +@noinline to avoid inlining GPU kernel launch overhead into hot rewind paths. +""" +@noinline function _cuda_poison_fill!(v::CuVector{T}) where {T} + length(v) > 0 && CUDA.fill!(v, _cuda_poison_value(T)) + return nothing +end + +# ============================================================================== +# _invalidate_released_slots! for CuTypedPool (Level 1+) +# ============================================================================== +# +# Overrides the no-op fallback in base. On CUDA: +# - Level 0: no-op (base _rewind_typed_pool! gates with S >= 1, so never called) +# - Level 1+: poison released CuVectors + invalidate N-way view cache +# - NO resize!(cuv, 0) — would free GPU memory + +@noinline function AdaptiveArrayPools._invalidate_released_slots!( + tp::CuTypedPool{T}, old_n_active::Int, S::Int + ) where {T} + new_n = tp.n_active + for i in (new_n + 1):old_n_active + # Poison released CuVectors with sentinel values + _cuda_poison_fill!(@inbounds tp.vectors[i]) + # Invalidate N-way cache entries for released slots. 
+ # After poisoning, cached views point at poisoned data — clear them so + # re-acquire creates fresh views instead of returning stale poisoned ones. + base = (i - 1) * CACHE_WAYS + for k in 1:CACHE_WAYS + @inbounds tp.views[base + k] = nothing + @inbounds tp.view_dims[base + k] = nothing + end + end + return nothing +end + +# ============================================================================== +# Borrow Tracking: Call-site recording (Level 3) +# ============================================================================== +# +# Overrides the no-op AbstractArrayPool fallbacks. +# The macro injects pool._pending_callsite = "file:line\nexpr" before acquire calls. +# These functions flush that pending info into the borrow log. + +"""Record pending callsite for borrow tracking (compiles to no-op when S < 3).""" +@inline function AdaptiveArrayPools._set_pending_callsite!(pool::CuAdaptiveArrayPool{S}, msg::String) where {S} + S >= 3 && isempty(pool._pending_callsite) && (pool._pending_callsite = msg) + return nothing +end + +"""Flush pending callsite into borrow log (compiles to no-op when S < 3).""" +@inline function AdaptiveArrayPools._maybe_record_borrow!(pool::CuAdaptiveArrayPool{S}, tp::AbstractTypedPool) where {S} + S >= 3 && _cuda_record_borrow_from_pending!(pool, tp) + return nothing +end + +@noinline function _cuda_record_borrow_from_pending!(pool::CuAdaptiveArrayPool, tp::AbstractTypedPool) + callsite = pool._pending_callsite + isempty(callsite) && return nothing + log = pool._borrow_log + if log === nothing + log = IdDict{Any, String}() + pool._borrow_log = log + end + @inbounds log[tp.vectors[tp.n_active]] = callsite + pool._pending_callsite = "" # Clear for next acquire + return nothing +end + +@noinline function _cuda_lookup_borrow_callsite(pool::CuAdaptiveArrayPool, v)::Union{Nothing, String} + log = pool._borrow_log + log === nothing && return nothing + return get(log, v, nothing) +end + +# 
============================================================================== +# Escape Detection: _validate_pool_return for CuArrays (Level 2+) +# ============================================================================== +# +# CuArray views share the same device buffer, so device pointer overlap +# detection works correctly. pointer(::CuArray) returns CuPtr{T}. + +function AdaptiveArrayPools._validate_pool_return(val, pool::CuAdaptiveArrayPool{S}) where {S} + (S >= 2 || POOL_DEBUG[]) || return nothing + _validate_cuda_return(val, pool) + return nothing +end + +function _validate_cuda_return(val, pool::CuAdaptiveArrayPool) + # Note: Container recursion (Tuple, NamedTuple, Pair, Dict, Set, AbstractArray) + # is duplicated from CPU's _validate_pool_return dispatch chain (src/debug.jl). + # CPU uses multiple dispatch on pool::AdaptiveArrayPool for each container type, + # which doesn't cover CuAdaptiveArrayPool. We could add CuAdaptiveArrayPool methods + # for each container, but that creates 6+ method definitions vs. this single function. + # Trade-off: if a new container type is added to the CPU path, it must also be added here. + + # CuArray (CuVector, CuMatrix, etc.) + if val isa CuArray + _check_cuda_pointer_overlap(val, pool) + return + end + + # SubArray / ReshapedArray of CuArray — defensive code. + # Current CUDA.jl: view(CuVector, 1:n) returns CuArray via GPUArrays derive(), + # NOT SubArray. These branches guard against future CUDA.jl behavior changes + # or user-constructed SubArray{T,N,CuArray} / ReshapedArray wrappers. 
+ if val isa SubArray + p = parent(val) + if p isa CuArray + _check_cuda_pointer_overlap(p, pool, val) + end + return + end + + if val isa Base.ReshapedArray + p = parent(val) + if p isa CuArray + _check_cuda_pointer_overlap(p, pool, val) + elseif p isa SubArray + pp = parent(p) + if pp isa CuArray + _check_cuda_pointer_overlap(pp, pool, val) + end + end + return + end + + # Tuple + if val isa Tuple + for x in val + _validate_cuda_return(x, pool) + end + return + end + + # NamedTuple + if val isa NamedTuple + for x in values(val) + _validate_cuda_return(x, pool) + end + return + end + + # Pair + if val isa Pair + _validate_cuda_return(val.first, pool) + _validate_cuda_return(val.second, pool) + return + end + + # AbstractDict + if val isa AbstractDict + for p in val + _validate_cuda_return(p, pool) + end + return + end + + # AbstractSet + if val isa AbstractSet + for x in val + _validate_cuda_return(x, pool) + end + return + end + + # Array of CuArrays (element recursion for containers) + if val isa AbstractArray + ET = eltype(val) + if !(ET <: Number) && !(ET <: AbstractString) && ET !== Symbol && ET !== Char + for x in val + _validate_cuda_return(x, pool) + end + end + end + + return +end + +""" + _check_cuda_pointer_overlap(arr::CuArray, pool, original_val=arr) + +Check if a CuArray's device memory overlaps with any pool backing CuVector. +Throws `PoolRuntimeEscapeError` on overlap. +""" +function _check_cuda_pointer_overlap(arr::CuArray, pool::CuAdaptiveArrayPool, original_val = arr) + arr_ptr = UInt(pointer(arr)) + arr_bytes = length(arr) * sizeof(eltype(arr)) + arr_end = arr_ptr + arr_bytes + + return_site = let rs = pool._pending_return_site + isempty(rs) ? 
nothing : rs + end + + # Check fixed slots + AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + _check_tp_cuda_overlap(tp, arr_ptr, arr_end, pool, return_site, original_val) + end + + # Check others + for tp in values(pool.others) + _check_tp_cuda_overlap(tp, arr_ptr, arr_end, pool, return_site, original_val) + end + return +end + +@noinline function _check_tp_cuda_overlap( + tp::AbstractTypedPool, arr_ptr::UInt, arr_end::UInt, + pool::CuAdaptiveArrayPool, return_site, original_val + ) + for v in tp.vectors + v_ptr = UInt(pointer(v)) + v_bytes = length(v) * sizeof(eltype(v)) + v_end = v_ptr + v_bytes + if !(arr_end <= v_ptr || v_end <= arr_ptr) + callsite = _cuda_lookup_borrow_callsite(pool, v) + _throw_pool_escape_error(original_val, eltype(v), callsite, return_site) + end + end + return +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl index 54384a1a..62796ec9 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl @@ -3,7 +3,7 @@ # ============================================================================== # Enables @with_pool :cuda syntax for GPU memory pooling. -using AdaptiveArrayPools: _get_pool_for_backend +using AdaptiveArrayPools: _get_pool_for_backend, _dispatch_pool_scope # ============================================================================== # Backend Registration (Val dispatch - zero overhead) @@ -14,3 +14,23 @@ Register :cuda backend for `@with_pool :cuda` syntax. Uses Val dispatch for compile-time resolution and full inlining. 
""" @inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() + +# ============================================================================== +# Union Splitting for CuAdaptiveArrayPool{S} +# ============================================================================== +# +# The base _dispatch_pool_scope has an `else` fallback for non-CPU pools that +# passes pool_any without type narrowing. This override provides union splitting +# for CUDA pools, enabling compile-time S → dead-code elimination of safety branches. + +@inline function AdaptiveArrayPools._dispatch_pool_scope(f, pool_any::CuAdaptiveArrayPool) + if pool_any isa CuAdaptiveArrayPool{0} + return f(pool_any::CuAdaptiveArrayPool{0}) + elseif pool_any isa CuAdaptiveArrayPool{1} + return f(pool_any::CuAdaptiveArrayPool{1}) + elseif pool_any isa CuAdaptiveArrayPool{2} + return f(pool_any::CuAdaptiveArrayPool{2}) + else + return f(pool_any::CuAdaptiveArrayPool{3}) + end +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl index 46cd616c..80a28ee2 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/state.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl @@ -1,9 +1,10 @@ # ============================================================================== # State Management for CUDA Pools # ============================================================================== -# checkpoint!, rewind!, reset!, empty! implementations for CuAdaptiveArrayPool. +# checkpoint!, rewind!, reset!, empty! implementations for CuAdaptiveArrayPool{S}. # Note: _checkpoint_typed_pool! and _rewind_typed_pool! already work with # AbstractTypedPool, so they work for CuTypedPool automatically. +# S parameter is threaded through rewind paths for compile-time safety dispatch. 
using AdaptiveArrayPools: checkpoint!, rewind!, reset!, _checkpoint_typed_pool!, _rewind_typed_pool!, _has_bit, @@ -18,7 +19,7 @@ using AdaptiveArrayPools: checkpoint!, rewind!, reset!, Apply `f` to each fixed slot CuTypedPool. Zero allocation via compile-time unrolling. """ -@generated function AdaptiveArrayPools.foreach_fixed_slot(f::F, pool::CuAdaptiveArrayPool) where {F} +@generated function AdaptiveArrayPools.foreach_fixed_slot(f::F, pool::CuAdaptiveArrayPool{S}) where {F, S} exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in GPU_FIXED_SLOT_FIELDS] return quote Base.@_inline_meta @@ -61,7 +62,7 @@ end end # Type-specific checkpoint (multiple types) -@generated function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool, types::Type...) +@generated function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool{S}, types::Type...) where {S} seen = Set{Any}() unique_indices = Int[] for i in eachindex(types) @@ -85,7 +86,7 @@ end # rewind! for CuAdaptiveArrayPool # ============================================================================== -function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) +function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool{S}) where {S} cur_depth = pool._current_depth # Safety guard: at global scope (depth=1), delegate to reset! 
@@ -94,14 +95,14 @@ function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) return nothing end - # Fixed slots + # Fixed slots — pass S for compile-time safety dispatch AdaptiveArrayPools.foreach_fixed_slot(pool) do tp - _rewind_typed_pool!(tp, cur_depth) + _rewind_typed_pool!(tp, cur_depth, S) end # Others for tp in values(pool.others) - _rewind_typed_pool!(tp, cur_depth) + _rewind_typed_pool!(tp, cur_depth, S) end pop!(pool._touched_type_masks) @@ -112,12 +113,12 @@ function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) end # Type-specific rewind (single type) -@inline function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T} +@inline function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool{S}, ::Type{T}) where {S, T} if pool._current_depth == 1 reset!(AdaptiveArrayPools.get_typed_pool!(pool, T)) return nothing end - _rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth) + _rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth, S) pop!(pool._touched_type_masks) pop!(pool._touched_has_others) pool._current_depth -= 1 @@ -125,7 +126,7 @@ end end # Type-specific rewind (multiple types) -@generated function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, types::Type...) +@generated function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool{S}, types::Type...) 
where {S} seen = Set{Any}() unique_indices = Int[] for i in eachindex(types) @@ -134,7 +135,7 @@ end push!(unique_indices, i) end end - rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in reverse(unique_indices)] + rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth, S)) for i in reverse(unique_indices)] reset_exprs = [:(reset!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]))) for i in unique_indices] return quote if pool._current_depth == 1 @@ -178,21 +179,21 @@ end return nothing end -@inline function AdaptiveArrayPools._lazy_rewind!(pool::CuAdaptiveArrayPool) +@inline function AdaptiveArrayPools._lazy_rewind!(pool::CuAdaptiveArrayPool{S}) where {S} d = pool._current_depth mask = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK - _has_bit(mask, Float64) && _rewind_typed_pool!(pool.float64, d) - _has_bit(mask, Float32) && _rewind_typed_pool!(pool.float32, d) - _has_bit(mask, Int64) && _rewind_typed_pool!(pool.int64, d) - _has_bit(mask, Int32) && _rewind_typed_pool!(pool.int32, d) - _has_bit(mask, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d) - _has_bit(mask, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d) - _has_bit(mask, Bool) && _rewind_typed_pool!(pool.bool, d) + _has_bit(mask, Float64) && _rewind_typed_pool!(pool.float64, d, S) + _has_bit(mask, Float32) && _rewind_typed_pool!(pool.float32, d, S) + _has_bit(mask, Int64) && _rewind_typed_pool!(pool.int64, d, S) + _has_bit(mask, Int32) && _rewind_typed_pool!(pool.int32, d, S) + _has_bit(mask, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d, S) + _has_bit(mask, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d, S) + _has_bit(mask, Bool) && _rewind_typed_pool!(pool.bool, d, S) # Bit 7: Float16 (CUDA reassignment — _fixed_slot_bit(Float16)==0, must use explicit bit check) - mask & _cuda_float16_bit() != 0 && _rewind_typed_pool!(pool.float16, d) + mask 
& _cuda_float16_bit() != 0 && _rewind_typed_pool!(pool.float16, d, S) if @inbounds(pool._touched_has_others[d]) for tp in values(pool.others) - _rewind_typed_pool!(tp, d) + _rewind_typed_pool!(tp, d, S) end end pop!(pool._touched_type_masks) @@ -228,17 +229,17 @@ end # Uses direct field access with bit checks — foreach_fixed_slot is single-argument (no bit yield). # Bit 7: Float16 (CUDA-specific; lazy-checkpointed on first touch by _record_type_touch!). # has_others: genuine others types (UInt8, Int8, etc.) — eagerly checkpointed at scope entry. -@inline function AdaptiveArrayPools._typed_lazy_rewind!(pool::CuAdaptiveArrayPool, tracked_mask::UInt16) +@inline function AdaptiveArrayPools._typed_lazy_rewind!(pool::CuAdaptiveArrayPool{S}, tracked_mask::UInt16) where {S} d = pool._current_depth touched = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK combined = tracked_mask | touched - _has_bit(combined, Float64) && _rewind_typed_pool!(pool.float64, d) - _has_bit(combined, Float32) && _rewind_typed_pool!(pool.float32, d) - _has_bit(combined, Int64) && _rewind_typed_pool!(pool.int64, d) - _has_bit(combined, Int32) && _rewind_typed_pool!(pool.int32, d) - _has_bit(combined, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d) - _has_bit(combined, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d) - _has_bit(combined, Bool) && _rewind_typed_pool!(pool.bool, d) + _has_bit(combined, Float64) && _rewind_typed_pool!(pool.float64, d, S) + _has_bit(combined, Float32) && _rewind_typed_pool!(pool.float32, d, S) + _has_bit(combined, Int64) && _rewind_typed_pool!(pool.int64, d, S) + _has_bit(combined, Int32) && _rewind_typed_pool!(pool.int32, d, S) + _has_bit(combined, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d, S) + _has_bit(combined, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d, S) + _has_bit(combined, Bool) && _rewind_typed_pool!(pool.bool, d, S) # Float16: bit 7 is set by _record_type_touch! on first touch (lazy first-touch). 
# Also rewind when Float16 was a *tracked* type in the macro: _typed_lazy_checkpoint! # calls checkpoint!(pool, Float16) which pushes a checkpoint at depth d, but _acquire_impl! @@ -247,11 +248,11 @@ end # tracked_mask carries no bit for Float16 either. # Solution: check _checkpoint_depths to detect "Float16 was checkpointed at this depth". if combined & _cuda_float16_bit() != 0 || @inbounds(pool.float16._checkpoint_depths[end]) == d - _rewind_typed_pool!(pool.float16, d) + _rewind_typed_pool!(pool.float16, d, S) end if @inbounds(pool._touched_has_others[d]) for tp in values(pool.others) - _rewind_typed_pool!(tp, d) + _rewind_typed_pool!(tp, d, S) end end pop!(pool._touched_type_masks) @@ -282,6 +283,11 @@ function AdaptiveArrayPools.reset!(pool::CuAdaptiveArrayPool) empty!(pool._touched_has_others) push!(pool._touched_has_others, false) # Sentinel: no others + # Reset borrow tracking state + pool._pending_callsite = "" + pool._pending_return_site = "" + pool._borrow_log = nothing + return pool end @@ -341,5 +347,10 @@ function Base.empty!(pool::CuAdaptiveArrayPool) empty!(pool._touched_has_others) push!(pool._touched_has_others, false) # Sentinel: no others + # Reset borrow tracking state + pool._pending_callsite = "" + pool._pending_return_site = "" + pool._borrow_log = nothing + return pool end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl index 60da07f6..4ab507dd 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl @@ -2,11 +2,12 @@ # Task-Local CUDA Pool (Multi-Device Aware) # ============================================================================== # Each Task gets one pool per GPU device to prevent cross-device memory access. +# Pools are parameterized by safety level S (CuAdaptiveArrayPool{S}). 
const _CU_POOL_KEY = :ADAPTIVE_ARRAY_POOL_CUDA """ - get_task_local_cuda_pool() -> CuAdaptiveArrayPool + get_task_local_cuda_pool() -> CuAdaptiveArrayPool{S} Retrieves (or creates) the `CuAdaptiveArrayPool` for the current Task and current GPU device. @@ -18,6 +19,7 @@ a dictionary of pools (one per device) in task-local storage, ensuring that: ## Implementation Uses `Dict{Int, CuAdaptiveArrayPool}` in task-local storage, keyed by device ID. +Values are `CuAdaptiveArrayPool{S}` — use `_dispatch_pool_scope` for union splitting. """ @inline function AdaptiveArrayPools.get_task_local_cuda_pool() # 1. Get or create the pools dictionary @@ -33,7 +35,7 @@ Uses `Dict{Int, CuAdaptiveArrayPool}` in task-local storage, keyed by device ID. # 3. Get or create pool for this device pool = get(pools, dev_id, nothing) if pool === nothing - pool = CuAdaptiveArrayPool() # Constructor captures device_id + pool = CuAdaptiveArrayPool() # Constructor uses POOL_SAFETY_LV[] pools[dev_id] = pool end @@ -52,5 +54,34 @@ Useful for diagnostics or bulk operations across all devices. pools = Dict{Int, CuAdaptiveArrayPool}() task_local_storage(_CU_POOL_KEY, pools) end - return pools::Dict{Int, CuAdaptiveArrayPool} + return pools +end + +# ============================================================================== +# Safety Level Hook (called from set_safety_level! in base) +# ============================================================================== + +function AdaptiveArrayPools._set_cuda_safety_level_hook!(level::Int) + pools = get(task_local_storage(), _CU_POOL_KEY, nothing) + pools === nothing && return nothing + + # Check that no pool is inside an active scope + for (dev_id, old_pool) in pools + old = old_pool::CuAdaptiveArrayPool + depth = old._current_depth + depth != 1 && throw( + ArgumentError( + "set_safety_level! 
cannot be called inside an active @with_pool :cuda scope " * + "(device=$dev_id, depth=$depth)" + ) + ) + end + + # Replace all pools (collect keys to avoid mutating Dict during iteration) + for dev_id in collect(keys(pools)) + old = pools[dev_id]::CuAdaptiveArrayPool + pools[dev_id] = _make_cuda_pool(level, old) + end + + return nothing end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index 4e5dd9fc..90f40741 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -83,9 +83,16 @@ const GPU_FIXED_SLOT_FIELDS = ( # ============================================================================== """ - CuAdaptiveArrayPool <: AbstractArrayPool + CuAdaptiveArrayPool{S} <: AbstractArrayPool -Multi-type GPU memory pool. Task-local and device-specific. +Multi-type GPU memory pool, parameterized by safety level `S` (0–3). + +## Safety Levels (CUDA-specific) +- `S=0`: Zero overhead — all safety branches eliminated by dead-code elimination +- `S=1`: Guard — poisoning (NaN/sentinel fill on released vectors) + cache invalidation + (CUDA equivalent of CPU's resize! structural invalidation) +- `S=2`: Full — poisoning + escape detection (`_validate_pool_return`) +- `S=3`: Debug — full + borrow call-site registry + debug messages ## Device Safety Each pool is bound to a specific GPU device. Using a pool on the wrong device @@ -95,8 +102,9 @@ causes undefined behavior. The `device_id` field tracks ownership. 
- Fixed slots for common GPU types (Float32 priority, includes Float16) - `others`: IdDict fallback for rare types - `device_id`: The GPU device this pool belongs to +- Borrow tracking fields (required by macro-injected field access at all S levels) """ -mutable struct CuAdaptiveArrayPool <: AbstractArrayPool +mutable struct CuAdaptiveArrayPool{S} <: AbstractArrayPool # Fixed Slots (GPU-optimized order) float32::CuTypedPool{Float32} float64::CuTypedPool{Float64} @@ -117,11 +125,16 @@ mutable struct CuAdaptiveArrayPool <: AbstractArrayPool # Device tracking (safety) device_id::Int + + # Borrow tracking (required: macro injects pool._pending_callsite = "..." as raw AST) + _pending_callsite::String + _pending_return_site::String + _borrow_log::Union{Nothing, IdDict{Any, String}} end -function CuAdaptiveArrayPool() +function CuAdaptiveArrayPool{S}() where {S} dev = CUDA.device() - return CuAdaptiveArrayPool( + return CuAdaptiveArrayPool{S}( CuTypedPool{Float32}(), CuTypedPool{Float64}(), CuTypedPool{Float16}(), @@ -134,6 +147,78 @@ function CuAdaptiveArrayPool() 1, # _current_depth (1 = global scope) [UInt16(0)], # _touched_type_masks: sentinel (no bits set) [false], # _touched_has_others: sentinel (no others) - CUDA.deviceid(dev) # Use public API + CUDA.deviceid(dev), + "", # _pending_callsite + "", # _pending_return_site + nothing # _borrow_log: lazily created at S >= 3 + ) +end + +"""Create pool at the current `POOL_SAFETY_LV[]` level.""" +CuAdaptiveArrayPool() = _make_cuda_pool(AdaptiveArrayPools.POOL_SAFETY_LV[]) + +# ============================================================================== +# Safety Level Dispatch +# ============================================================================== + +""" + _safety_level(pool::CuAdaptiveArrayPool{S}) -> Int + +Return compile-time constant safety level for CUDA pools. 
+""" +@inline AdaptiveArrayPools._safety_level(::CuAdaptiveArrayPool{S}) where {S} = S + +""" + _make_cuda_pool(s::Int) -> CuAdaptiveArrayPool{s} + +Function barrier: converts runtime `Int` to concrete `CuAdaptiveArrayPool{S}`. +Levels outside 0-3 are clamped (≤0 → 0, ≥3 → 3). +""" +@noinline function _make_cuda_pool(s::Int) + s <= 0 && return CuAdaptiveArrayPool{0}() + s == 1 && return CuAdaptiveArrayPool{1}() + s == 2 && return CuAdaptiveArrayPool{2}() + return CuAdaptiveArrayPool{3}() +end + +""" + _make_cuda_pool(s::Int, old::CuAdaptiveArrayPool) -> CuAdaptiveArrayPool{s} + +Create a new pool at safety level `s`, transferring cached arrays and scope state +from `old`. Only reference copies — no memory allocation for underlying GPU buffers. + +Transferred: all CuTypedPool slots, `others`, depth & touch tracking, device_id. +Reset: `_pending_callsite/return_site` (transient macro state), + `_borrow_log` (created fresh when `s >= 3`). +""" +@noinline function _make_cuda_pool(s::Int, old::CuAdaptiveArrayPool) + s <= 0 && return _transfer_cuda_pool(Val(0), old) + s == 1 && return _transfer_cuda_pool(Val(1), old) + s == 2 && return _transfer_cuda_pool(Val(2), old) + return _transfer_cuda_pool(Val(3), old) +end + +"""Transfer cached arrays and scope state from `old` pool into a new `CuAdaptiveArrayPool{V}`.""" +function _transfer_cuda_pool(::Val{V}, old::CuAdaptiveArrayPool) where {V} + return CuAdaptiveArrayPool{V}( + old.float32, old.float64, old.float16, + old.int32, old.int64, + old.complexf32, old.complexf64, old.bool, + old.others, + old._current_depth, + old._touched_type_masks, + old._touched_has_others, + old.device_id, + "", # _pending_callsite: reset + "", # _pending_return_site: reset + V >= 3 ? 
IdDict{Any, String}() : nothing # _borrow_log ) end + +"""Human-readable safety level label.""" +function _cuda_safety_label(s::Int) + s <= 0 && return "off" + s == 1 && return "guard" + s == 2 && return "full" + return "debug" +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/utils.jl b/ext/AdaptiveArrayPoolsCUDAExt/utils.jl index c8c22556..18d40512 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/utils.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/utils.jl @@ -57,11 +57,14 @@ end Print statistics for a CUDA adaptive array pool. """ -function AdaptiveArrayPools.pool_stats(pool::CuAdaptiveArrayPool; io::IO = stdout) - # Header with device info +function AdaptiveArrayPools.pool_stats(pool::CuAdaptiveArrayPool{S}; io::IO = stdout) where {S} + # Header with device info and safety level printstyled(io, "CuAdaptiveArrayPool", bold = true, color = :green) + printstyled(io, "{$S}", color = :yellow) printstyled(io, " (device ", color = :dark_gray) printstyled(io, pool.device_id, color = :blue) + printstyled(io, ", safety=", color = :dark_gray) + printstyled(io, _cuda_safety_label(S), color = :yellow) printstyled(io, ")\n", color = :dark_gray) has_content = false @@ -112,7 +115,7 @@ end # ============================================================================== # Compact one-line show -function Base.show(io::IO, pool::CuAdaptiveArrayPool) +function Base.show(io::IO, pool::CuAdaptiveArrayPool{S}) where {S} n_types = Ref(0) total_vectors = Ref(0) total_active = Ref(0) @@ -131,7 +134,7 @@ function Base.show(io::IO, pool::CuAdaptiveArrayPool) total_active[] += tp.n_active end - return print(io, "CuAdaptiveArrayPool(device=$(pool.device_id), types=$(n_types[]), slots=$(total_vectors[]), active=$(total_active[]))") + return print(io, "CuAdaptiveArrayPool{$S}(safety=$(_cuda_safety_label(S)), device=$(pool.device_id), types=$(n_types[]), slots=$(total_vectors[]), active=$(total_active[]))") end # Multi-line show diff --git a/src/debug.jl b/src/debug.jl index 256a1ac6..83eed243 100644 --- 
a/src/debug.jl +++ b/src/debug.jl @@ -241,7 +241,7 @@ function _validate_pool_return(val::AbstractSet, pool::AdaptiveArrayPool) end _validate_pool_return(val, ::DisabledPool) = nothing -# No-op fallback for non-CPU pools (e.g. CuAdaptiveArrayPool) that lack borrow tracking fields +# No-op fallback for pool types without specific validation (overridden by CUDA extension) _validate_pool_return(val, ::AbstractArrayPool) = nothing # ============================================================================== diff --git a/src/task_local_pool.jl b/src/task_local_pool.jl index 06b8a9ed..d316a703 100644 --- a/src/task_local_pool.jl +++ b/src/task_local_pool.jl @@ -144,22 +144,18 @@ end """ set_safety_level!(level::Int) -> AdaptiveArrayPool -Replace the task-local pool with a new `AdaptiveArrayPool{level}`, -preserving all cached arrays and scope state from the old pool. +Replace the task-local CPU pool (and CUDA pools if CUDA.jl is loaded) +with new pools at the given safety level, preserving cached arrays +and scope state (zero-copy transfer). -Cached TypedPool/BitTypedPool slots, the `others` IdDict, depth tracking, -and touch masks are transferred by reference (zero copy). -Transient borrow-tracking state (`_pending_callsite`, `_borrow_log`) is reset. - -One-time JIT cost for new `S` specialization. -Also updates `POOL_SAFETY_LV[]` so that `AdaptiveArrayPool()` creates pools -at the new level. +Also updates `POOL_SAFETY_LV[]` so that future `AdaptiveArrayPool()` / +`CuAdaptiveArrayPool()` constructors use the new level. ## Example ```julia -set_safety_level!(2) # Enable full safety (escape detection + poisoning) +set_safety_level!(2) # Enable full safety on CPU + all GPU devices # ... run suspicious code ... 
-set_safety_level!(0) # Back to zero overhead — cached arrays still available +set_safety_level!(0) # Back to zero overhead everywhere ``` See also: [`_safety_level`], [`POOL_SAFETY_LV`] @@ -178,9 +174,14 @@ function set_safety_level!(level::Int) POOL_SAFETY_LV[] = level new_pool = old_pool === nothing ? _make_pool(level) : _make_pool(level, old_pool::AdaptiveArrayPool) task_local_storage(_POOL_KEY, new_pool) + # Update CUDA pools if extension is loaded (no-op otherwise) + _set_cuda_safety_level_hook!(level) return new_pool end +# Hook for CUDA extension to override. No-op when CUDA is not loaded. +_set_cuda_safety_level_hook!(::Int) = nothing + # ============================================================================== # CUDA Pool Stubs (overridden by extension when CUDA is loaded) # ============================================================================== diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index cc2fb158..2c016a50 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -41,4 +41,5 @@ else include("test_display.jl") include("test_convenience.jl") include("test_disabled_pool.jl") + include("test_cuda_safety.jl") end diff --git a/test/cuda/test_cuda_safety.jl b/test/cuda/test_cuda_safety.jl new file mode 100644 index 00000000..a5fbc5d4 --- /dev/null +++ b/test/cuda/test_cuda_safety.jl @@ -0,0 +1,659 @@ +import AdaptiveArrayPools: PoolRuntimeEscapeError, PoolEscapeError, _safety_level, POOL_DEBUG + +const _make_cuda_pool = ext._make_cuda_pool + +# Opaque identity — defeats compile-time escape analysis +_cuda_test_leak(x) = x + +@testset "CUDA Safety Dispatch (CuAdaptiveArrayPool{S})" begin + + # ============================================================================== + # Type parameterization basics + # ============================================================================== + + @testset "CuAdaptiveArrayPool{S} construction and _safety_level" begin + p0 = _make_cuda_pool(0) + p1 = _make_cuda_pool(1) + p2 = 
_make_cuda_pool(2) + p3 = _make_cuda_pool(3) + + @test p0 isa CuAdaptiveArrayPool{0} + @test p1 isa CuAdaptiveArrayPool{1} + @test p2 isa CuAdaptiveArrayPool{2} + @test p3 isa CuAdaptiveArrayPool{3} + + @test _safety_level(p0) == 0 + @test _safety_level(p1) == 1 + @test _safety_level(p2) == 2 + @test _safety_level(p3) == 3 + + # Borrow fields exist at all levels (required by macro-injected field access) + @test hasfield(typeof(p0), :_pending_callsite) + @test hasfield(typeof(p0), :_pending_return_site) + @test hasfield(typeof(p0), :_borrow_log) + end + + # ============================================================================== + # Level 0: No poisoning, no validation + # ============================================================================== + + @testset "Level 0: no poisoning on rewind" begin + pool = _make_cuda_pool(0) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # With safety off, backing vector still has valid data + @test length(pool.float32.vectors[1]) >= 10 + # Data should still be there (no poisoning) + checkpoint!(pool) + v2 = acquire!(pool, Float32, 10) + @test all(x -> x == 42.0f0, Array(v2)) + rewind!(pool) + end + + # ============================================================================== + # Level 1: Poisoning (CUDA equivalent of CPU's resize! invalidation) + # ============================================================================== + + @testset "Level 1: Float32 poisoned with NaN on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # Backing vector should be poisoned with NaN (NOT freed via resize!) 
+ @test length(pool.float32.vectors[1]) >= 10 + cpu_data = Array(pool.float32.vectors[1]) + @test all(isnan, cpu_data[1:10]) + + # Re-acquire: should see poisoned data + checkpoint!(pool) + v2 = acquire!(pool, Float32, 10) + @test all(isnan, Array(v2)) + rewind!(pool) + end + + @testset "Level 1: Int32 poisoned with typemax on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Int32, 8) + CUDA.fill!(v, Int32(42)) + rewind!(pool) + + cpu_data = Array(pool.int32.vectors[1]) + @test all(==(typemax(Int32)), cpu_data[1:8]) + end + + @testset "Level 1: ComplexF32 poisoned with NaN on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, ComplexF32, 8) + CUDA.fill!(v, ComplexF32(1.0f0 + 2.0f0im)) + rewind!(pool) + + cpu_data = Array(pool.complexf32.vectors[1]) + @test all(z -> isnan(real(z)) && isnan(imag(z)), cpu_data[1:8]) + end + + @testset "Level 1: Bool poisoned with true on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Bool, 16) + CUDA.fill!(v, false) + rewind!(pool) + + cpu_data = Array(pool.bool.vectors[1]) + @test all(==(true), cpu_data[1:16]) + end + + @testset "Level 1: Float16 poisoned with NaN on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float16, 10) + CUDA.fill!(v, Float16(42.0)) + rewind!(pool) + + cpu_data = Array(pool.float16.vectors[1]) + @test all(isnan, cpu_data[1:10]) + end + + @testset "Level 1: N-way cache invalidated on poisoned rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 1.0f0) + rewind!(pool) + + # Cached views should be cleared (nothing) after poisoning + base = 0 * ext.CACHE_WAYS + for k in 1:ext.CACHE_WAYS + @test pool.float32.views[base + k] === nothing + end + end + + @testset "Level 1: no escape detection" begin + # Level 1 should NOT throw on escape (that's Level 2+) + pool = _make_cuda_pool(1) + result = begin + checkpoint!(pool) + v = 
acquire!(pool, Float32, 10) + rewind!(pool) + v # "escaping" — should not throw at Level 1 + end + @test result isa CuArray + end + + # ============================================================================== + # Level 0: Verify no poisoning + # ============================================================================== + + @testset "Level 0: no poisoning (verify data survives rewind)" begin + pool = _make_cuda_pool(0) + checkpoint!(pool) + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 42.0f0) + rewind!(pool) + + # Data should NOT be poisoned at Level 0 + cpu_data = Array(pool.float32.vectors[1]) + @test all(x -> x == 42.0f0, cpu_data[1:10]) + end + + # ============================================================================== + # Level 2: Escape detection + # ============================================================================== + + @testset "Level 2: escape detection catches CuArray leak" begin + pool = _make_cuda_pool(2) + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + # Simulate what _validate_pool_return does + AdaptiveArrayPools._validate_pool_return(_cuda_test_leak(v), pool) + finally + rewind!(pool) + end + end + end + + @testset "Level 2: safe scalar return does not throw" begin + pool = _make_cuda_pool(2) + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 3.0f0) + result = sum(Array(v)) # scalar — safe + AdaptiveArrayPools._validate_pool_return(result, pool) + @test result == 30.0f0 + finally + rewind!(pool) + end + end + + @testset "Level 2: escape detection with Tuple containing CuArray" begin + pool = _make_cuda_pool(2) + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + val = (42, _cuda_test_leak(v)) + AdaptiveArrayPools._validate_pool_return(val, pool) + finally + rewind!(pool) + end + end + end + + @testset "Level 2: escape detection with Dict containing CuArray" begin + pool = 
_make_cuda_pool(2) + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + val = Dict(:data => _cuda_test_leak(v)) + AdaptiveArrayPools._validate_pool_return(val, pool) + finally + rewind!(pool) + end + end + end + + @testset "Level 0 and 1: no escape detection" begin + for lv in (0, 1) + pool = _make_cuda_pool(lv) + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + # Should NOT throw — escape detection requires Level 2+ + AdaptiveArrayPools._validate_pool_return(_cuda_test_leak(v), pool) + finally + rewind!(pool) + end + end + end + + # ============================================================================== + # Level 3: Borrow tracking + # ============================================================================== + + @testset "Level 3: borrow fields functional" begin + pool = _make_cuda_pool(3) + @test pool._pending_callsite == "" + @test pool._pending_return_site == "" + @test pool._borrow_log === nothing # lazily created + end + + @testset "Level 3: _set_pending_callsite! works" begin + pool = _make_cuda_pool(3) + AdaptiveArrayPools._set_pending_callsite!(pool, "test.jl:42\nacquire!(pool, Float32, 10)") + @test pool._pending_callsite == "test.jl:42\nacquire!(pool, Float32, 10)" + + # At Level 0, should be no-op + pool0 = _make_cuda_pool(0) + AdaptiveArrayPools._set_pending_callsite!(pool0, "should not be set") + @test pool0._pending_callsite == "" + end + + @testset "Level 3: _maybe_record_borrow! 
records callsite" begin + pool = _make_cuda_pool(3) + checkpoint!(pool) + tp = get_typed_pool!(pool, Float32) + + # Set pending callsite, then acquire to increment n_active + AdaptiveArrayPools._set_pending_callsite!(pool, "test.jl:99\nacquire!(pool, Float32, 5)") + acquire!(pool, Float32, 5) + + # The borrow log should now have an entry + @test pool._borrow_log !== nothing + @test length(pool._borrow_log) >= 1 + + rewind!(pool) + end + + # ============================================================================== + # set_safety_level! — all-device replacement + # ============================================================================== + + @testset "set_safety_level! replaces pool with state preservation" begin + # Get current pool (creates one at default safety level) + pool = get_task_local_cuda_pool() + reset!(pool) + + # Populate with some data + checkpoint!(pool) + v = acquire!(pool, Float32, 100) + CUDA.fill!(v, 1.0f0) + rewind!(pool) + + # Change safety level + set_safety_level!(2) + new_pool = get_task_local_cuda_pool() + + @test new_pool isa CuAdaptiveArrayPool{2} + @test _safety_level(new_pool) == 2 + # Cached vectors should be preserved (same object reference) + @test new_pool.float32.vectors[1] === pool.float32.vectors[1] + + # Restore + set_safety_level!(0) + @test get_task_local_cuda_pool() isa CuAdaptiveArrayPool{0} + end + + @testset "set_safety_level! 
rejects inside active scope" begin + pool = get_task_local_cuda_pool() + checkpoint!(pool) + try + @test_throws ArgumentError set_safety_level!(2) + finally + rewind!(pool) + end + end + + # ============================================================================== + # Nested scopes: inner poisoned, outer valid + # ============================================================================== + + @testset "Nested scopes: inner poisoned, outer still valid" begin + pool = _make_cuda_pool(1) + + checkpoint!(pool) + v_outer = acquire!(pool, Float32, 10) + CUDA.fill!(v_outer, 1.0f0) + + # Inner scope + checkpoint!(pool) + v_inner = acquire!(pool, Float32, 20) + CUDA.fill!(v_inner, 2.0f0) + rewind!(pool) + + # Inner should be poisoned (slot 2 released) + cpu_inner = Array(pool.float32.vectors[2]) + @test all(isnan, cpu_inner[1:20]) + + # Outer should still be valid (slot 1 not released) + cpu_outer = Array(v_outer) + @test all(x -> x == 1.0f0, cpu_outer) + + rewind!(pool) + # Now outer is also poisoned + cpu_outer_after = Array(pool.float32.vectors[1]) + @test all(isnan, cpu_outer_after[1:10]) + end + + # ============================================================================== + # reset! with safety + # ============================================================================== + + @testset "reset! 
clears borrow tracking state" begin + pool = _make_cuda_pool(3) + pool._pending_callsite = "test" + pool._pending_return_site = "test" + pool._borrow_log = IdDict{Any, String}() + + reset!(pool) + + @test pool._pending_callsite == "" + @test pool._pending_return_site == "" + @test pool._borrow_log === nothing + end + + # ============================================================================== + # Display includes {S} and safety label + # ============================================================================== + + @testset "show includes {S} and safety label" begin + pool = _make_cuda_pool(2) + s = sprint(show, pool) + @test occursin("{2}", s) + @test occursin("safety=full", s) + + pool0 = _make_cuda_pool(0) + s0 = sprint(show, pool0) + @test occursin("{0}", s0) + @test occursin("safety=off", s0) + end + + # ============================================================================== + # POOL_DEBUG backward compat with CUDA + # ============================================================================== + + @testset "POOL_DEBUG backward compat triggers CUDA escape detection" begin + old_debug = POOL_DEBUG[] + + POOL_DEBUG[] = true + pool = _make_cuda_pool(0) # Safety off, but POOL_DEBUG overrides + @test_throws PoolRuntimeEscapeError begin + checkpoint!(pool) + try + v = acquire!(pool, Float32, 10) + AdaptiveArrayPools._validate_pool_return(_cuda_test_leak(v), pool) + finally + rewind!(pool) + end + end + + POOL_DEBUG[] = old_debug + end + + # ============================================================================== + # Fallback types (pool.others) poisoning + # ============================================================================== + + @testset "Fallback type (UInt8) poisoned on rewind" begin + pool = _make_cuda_pool(1) + checkpoint!(pool) + v = acquire!(pool, UInt8, 16) + CUDA.fill!(v, UInt8(42)) + rewind!(pool) + + tp = pool.others[UInt8] + cpu_data = Array(tp.vectors[1]) + @test all(==(typemax(UInt8)), cpu_data[1:16]) + end + + # 
============================================================================== + # @with_pool :cuda integration with safety + # ============================================================================== + + @testset "@with_pool :cuda with escape detection" begin + old_debug = POOL_DEBUG[] + POOL_DEBUG[] = true # Use POOL_DEBUG to trigger on any safety level + + @test_throws PoolRuntimeEscapeError @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + + POOL_DEBUG[] = old_debug + end + + @testset "@with_pool :cuda safe return" begin + old_debug = POOL_DEBUG[] + POOL_DEBUG[] = true + + result = @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 3.0f0) + sum(Array(v)) # scalar return — safe + end + @test result == 30.0f0 + + POOL_DEBUG[] = old_debug + end + + # ============================================================================== + # Compile-time escape detection (@with_pool :cuda) + # ============================================================================== + + @testset "Compile-time: direct CuArray escape caught at macro expansion" begin + @test_throws PoolEscapeError @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + v # direct escape in tail position + end + end + + @testset "Compile-time: safe scalar return passes" begin + # Should NOT throw at macro expansion time + ex = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + sum(Array(v)) # scalar — safe + end + @test ex isa Expr + end + + @testset "Compile-time: zeros!/ones! 
escape caught" begin + @test_throws PoolEscapeError @macroexpand @with_pool :cuda pool begin + v = zeros!(pool, Float32, 10) + v + end + end + + # ============================================================================== + # @with_pool :cuda at native Level 2 (no POOL_DEBUG hack) + # ============================================================================== + + @testset "@with_pool :cuda Level 2 escape detection (native S=2)" begin + set_safety_level!(2) + + @test_throws PoolRuntimeEscapeError @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + + set_safety_level!(0) + end + + @testset "@with_pool :cuda Level 2 safe return (native S=2)" begin + set_safety_level!(2) + + result = @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + CUDA.fill!(v, 5.0f0) + sum(Array(v)) + end + @test result == 50.0f0 + + set_safety_level!(0) + end + + @testset "@with_pool :cuda Level 1 no escape detection (native S=1)" begin + set_safety_level!(1) + + # Level 1 should NOT trigger escape detection + result = @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + @test result isa CuArray + + set_safety_level!(0) + end + + # ============================================================================== + # Level 3 borrow tracking via macro path + # ============================================================================== + + @testset "@with_pool :cuda Level 3 escape error includes callsite" begin + set_safety_level!(3) + + err = try + @with_pool :cuda pool begin + v = acquire!(pool, Float32, 10) + _cuda_test_leak(v) + end + nothing + catch e + e + end + + @test err isa PoolRuntimeEscapeError + @test err.callsite !== nothing + @test contains(err.callsite, ":") # "file:line" format + + set_safety_level!(0) + end + + @testset "@with_pool :cuda Level 3 callsite includes expression text" begin + set_safety_level!(3) + + err = try + @with_pool :cuda pool begin + v = zeros!(pool, Float32, 10) + 
_cuda_test_leak(v) + end + nothing + catch e + e + end + + @test err isa PoolRuntimeEscapeError + @test err.callsite !== nothing + @test contains(err.callsite, "\n") + @test contains(err.callsite, "zeros!(pool, Float32, 10)") + + set_safety_level!(0) + end + + @testset "LV<3 does not create borrow log on CUDA" begin + for lv in (0, 1, 2) + pool = _make_cuda_pool(lv) + checkpoint!(pool) + _ = acquire!(pool, Float32, 10) + @test pool._borrow_log === nothing + rewind!(pool) + end + end + + @testset "LV=3 creates borrow log on CUDA acquire" begin + pool = _make_cuda_pool(3) + checkpoint!(pool) + _ = acquire!(pool, Float32, 10) + @test pool._borrow_log !== nothing + @test pool._borrow_log isa IdDict + rewind!(pool) + end + + # ============================================================================== + # Error message content (showerror) + # ============================================================================== + + @testset "showerror: CuArray escape error message format" begin + # LV≥2 without callsite → "Tip: set LV=3" + err = PoolRuntimeEscapeError("CuArray{Float32, 1}", "Float32", nothing, nothing) + io = IOBuffer() + showerror(io, err) + msg = String(take!(io)) + + @test contains(msg, "PoolEscapeError") + @test contains(msg, "CuArray{Float32, 1}") + @test contains(msg, "Float32") + @test contains(msg, "POOL_SAFETY_LV ≥ 2") + @test contains(msg, "Tip:") + @test contains(msg, "POOL_SAFETY_LV[] = 3") + end + + @testset "showerror: CuArray with callsite (LV≥3)" begin + err = PoolRuntimeEscapeError( + "CuArray{Float32, 1}", "Float32", + "test_cuda.jl:42\nacquire!(pool, Float32, 10)", nothing + ) + io = IOBuffer() + showerror(io, err) + msg = String(take!(io)) + + @test contains(msg, "acquired at") + @test contains(msg, "test_cuda.jl:42") + @test contains(msg, "acquire!(pool, Float32, 10)") + @test contains(msg, "POOL_SAFETY_LV ≥ 3") + @test !contains(msg, "Tip:") # No tip when callsite is present + end + + # 
============================================================================== + # Function form: @with_pool :cuda pool function ... + # ============================================================================== + + @testset "Function form: escape detection with explicit return" begin + set_safety_level!(2) + + @with_pool :cuda pool function _cuda_test_return_escape() + v = acquire!(pool, Float32, 10) + return _cuda_test_leak(v) + end + + @test_throws PoolRuntimeEscapeError _cuda_test_return_escape() + + set_safety_level!(0) + end + + @testset "Function form: safe scalar return passes" begin + set_safety_level!(2) + + @with_pool :cuda pool function _cuda_test_safe_return() + v = acquire!(pool, Float32, 5) + CUDA.fill!(v, 4.0f0) + return sum(Array(v)) + end + + @test _cuda_test_safe_return() == 20.0f0 + + set_safety_level!(0) + end + + @testset "Function form: bare return (nothing) passes" begin + set_safety_level!(2) + + @with_pool :cuda pool function _cuda_test_bare_return() + _ = acquire!(pool, Float32, 10) + return + end + + @test _cuda_test_bare_return() === nothing + + set_safety_level!(0) + end + +end # CUDA Safety Dispatch