diff --git a/.codecov.yml b/.codecov.yml index a393180b..4caafbf6 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -10,6 +10,7 @@ coverage: ignore: - "ext/**/*" + - "src/legacy/**/*" comment: layout: "reach,diff,flags,files" diff --git a/Project.toml b/Project.toml index 463d051d..c61ccfa7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AdaptiveArrayPools" uuid = "4f381ef7-9af0-4cbe-99d4-cf36d7b0f233" -version = "0.2.1" authors = ["Min-Gu Yoo "] +version = "0.2.1" [deps] Preferences = "21216c6a-2e73-6563-6e65-726566657250" @@ -14,7 +14,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" AdaptiveArrayPoolsCUDAExt = "CUDA" [compat] -julia = "1.10" -Preferences = "1" CUDA = "5" -Printf = "1" \ No newline at end of file +Preferences = "1" +Printf = "1" +julia = "1.10" diff --git a/docs/design/cuda_extension_design.md b/docs/design/cuda_extension_design.md index 5b31c2d8..f7eb7ee7 100644 --- a/docs/design/cuda_extension_design.md +++ b/docs/design/cuda_extension_design.md @@ -1,9 +1,10 @@ # AdaptiveArrayPools.jl CUDA Extension Design -> **Status**: Draft v0.6 (Post-Review Revision) -> **Version**: 0.6 -> **Date**: 2024-12-14 -> **Authors**: Design discussion with AI assistance +> **Update (v0.2.2, feat/new_array_nd)**: The CPU path now uses `setfield!`-based wrapper +> reuse (Julia 1.11+) instead of the N-way cache for `unsafe_acquire!`. The **CUDA extension +> still uses the N-way set-associative cache** described in this document, since `CuArray` +> does not support `setfield!`-based field mutation. `CACHE_WAYS` and `set_cache_ways!` are +> now only relevant for the CUDA backend (and Julia 1.10 legacy CPU path). ## 1. Executive Summary diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md index 3d8c84a4..b05d0f17 100644 --- a/docs/design/hybrid_api_design.md +++ b/docs/design/hybrid_api_design.md @@ -1,5 +1,13 @@ # Hybrid API Design: acquire! vs unsafe_acquire! 
+> **Update (v0.2.2, feat/new_array_nd)**: The `unsafe_acquire!` path no longer uses +> `unsafe_wrap` + N-way cache on Julia 1.11+ CPU. Instead, it uses `setfield!`-based +> wrapper reuse — **0-alloc for any number of dimension patterns** (no eviction limit). +> The N-way cache (`CACHE_WAYS`) is now only used by the **CUDA** backend and the +> **Julia 1.10 legacy** fallback. The `acquire!` → `ReshapedArray` path is unchanged. +> `TypedPool` fields changed: `nd_arrays`/`nd_dims`/`nd_ptrs`/`nd_next_way` → +> `nd_wrappers::Vector{Union{Nothing, Vector{Any}}}`. + ## Executive Summary Redesigning `AdaptiveArrayPools.jl`'s N-D array acquisition API with a **Two Tools Strategy**: diff --git a/docs/design/nd_array_approach_comparison.md b/docs/design/nd_array_approach_comparison.md index d4a531fd..04653bff 100644 --- a/docs/design/nd_array_approach_comparison.md +++ b/docs/design/nd_array_approach_comparison.md @@ -1,5 +1,12 @@ # N-D Array Approach Comparison: unsafe_wrap vs ReshapedArray +> **Update (v0.2.2, feat/new_array_nd)**: The N-way set-associative cache described in this +> document has been **superseded on Julia 1.11+ CPU** by `setfield!`-based wrapper reuse +> (`nd_wrappers` indexed by dimensionality N). This achieves **0-alloc for unlimited dimension +> patterns** — no eviction, no `CACHE_WAYS` limit. The N-way cache remains in use for +> **CUDA** and the **Julia 1.10 legacy** path. The `acquire!` → `ReshapedArray` path is +> unchanged. See `src/acquire.jl` and `src/types.jl` for the current implementation. 
+ ## Summary This document analyzes two approaches for returning N-dimensional arrays from AdaptiveArrayPools: diff --git a/docs/src/architecture/design-docs.md b/docs/src/architecture/design-docs.md index 4b5e0514..8d9b68a3 100644 --- a/docs/src/architecture/design-docs.md +++ b/docs/src/architecture/design-docs.md @@ -10,7 +10,7 @@ For in-depth analysis of design decisions, implementation tradeoffs, and archite ## Caching & Performance - **[nd_array_approach_comparison.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/nd_array_approach_comparison.md)** - N-way cache design, boxing analysis, and ReshapedArray benchmarks + N-way cache design (now legacy — replaced by `setfield!` reuse on Julia 1.11+ CPU), boxing analysis, and ReshapedArray benchmarks - **[fixed_slots_codegen_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/fixed_slots_codegen_design.md)** Zero-allocation iteration via `@generated` functions and fixed-slot type dispatch @@ -32,7 +32,7 @@ For in-depth analysis of design decisions, implementation tradeoffs, and archite | Document | Focus Area | Key Insights | |----------|------------|--------------| | hybrid_api_design | API strategy | View types for zero-alloc, Array for FFI | -| nd_array_approach_comparison | Caching | N-way associative cache reduces header allocation | +| nd_array_approach_comparison | Caching | N-way cache (legacy); setfield! 
reuse on Julia 1.11+ CPU | | fixed_slots_codegen_design | Codegen | @generated functions enable type-stable iteration | | untracked_acquire_design | Macro safety | Sentinel pattern ensures correct cleanup | | cuda_extension_design | GPU support | Seamless CPU/CUDA API parity | diff --git a/docs/src/architecture/how-it-works.md b/docs/src/architecture/how-it-works.md index 06ba9826..f812e5e0 100644 --- a/docs/src/architecture/how-it-works.md +++ b/docs/src/architecture/how-it-works.md @@ -82,57 +82,66 @@ end When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` — no dictionary lookup, no type instability. -## N-Way Set Associative Cache +## N-D Wrapper Reuse (CPU) -For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: +For `unsafe_acquire!` (which returns native `Array` types), the caching strategy depends on the Julia version: + +### Julia 1.11+: `setfield!`-based Wrapper Reuse (Zero-Allocation) + +Julia 1.11 changed `Array` from an opaque C struct to a mutable Julia struct with `ref::MemoryRef{T}` and `size::NTuple{N,Int}` fields. This enables in-place mutation of cached `Array` wrappers via `setfield!`: ``` - CACHE_WAYS = 4 (default) - ┌────┬────┬────┬────┐ -Slot 0 (Float64): │way0│way1│way2│way3│ ← round-robin eviction - └────┴────┴────┴────┘ - ┌────┬────┬────┬────┐ -Slot 1 (Float32): │way0│way1│way2│way3│ - └────┴────┴────┴────┘ - ... +nd_wrappers[N][slot] → cached Array{T,N} + │ + ├─ setfield!(:ref, new_memory_ref) ← update backing memory (0-alloc) + └─ setfield!(:size, new_dims) ← update dimensions (0-alloc) ``` -### Cache Lookup Pseudocode +**Result**: Unlimited dimension patterns per slot with **zero allocation** after warmup. No eviction, no round-robin, no `CACHE_WAYS` limit. ```julia +# Pseudocode for Julia 1.11+ path function unsafe_acquire!(pool, T, dims...) 
typed_pool = get_typed_pool!(pool, T) - slot = n_active + 1 - base = (slot - 1) * CACHE_WAYS - - # Search all ways for matching dimensions - for k in 1:CACHE_WAYS - idx = base + k - if dims == typed_pool.nd_dims[idx] - # Cache hit! Check if underlying vector was resized - if pointer matches - return typed_pool.nd_arrays[idx] - end - end + flat_view = get_view!(typed_pool, prod(dims)) + slot = typed_pool.n_active + + # Direct index lookup by dimensionality N (~1ns) + wrapper = typed_pool.nd_wrappers[N][slot] + if wrapper !== nothing + setfield!(wrapper, :ref, getfield(vec, :ref)) # 0-alloc + setfield!(wrapper, :size, dims) # 0-alloc + return wrapper end - # Cache miss: create new Array header, store in next way (round-robin) - way = typed_pool.nd_next_way[slot] - typed_pool.nd_next_way[slot] = (way + 1) % CACHE_WAYS - # ... create and cache Array ... + # First call for this (slot, N): unsafe_wrap once, cached forever + arr = wrap_array(typed_pool, flat_view, dims) + store_wrapper!(typed_pool, N, slot, arr) + return arr end ``` -**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. The actual data memory is always reused from the pool. +### Julia 1.10 (Legacy): N-Way Set Associative Cache + +On Julia 1.10, `Array` fields cannot be mutated, so the legacy path uses a 4-way set-associative cache with round-robin eviction: + +- Cache hit (≤`CACHE_WAYS` dimension patterns per slot): **0 bytes** +- Cache miss (>`CACHE_WAYS` patterns): **~80-144 bytes** per `unsafe_wrap` call + +See [Configuration](../features/configuration.md) for `CACHE_WAYS` tuning (Julia 1.10 / CUDA only). + +### CUDA: N-Way Cache + +The CUDA backend still uses the N-way set-associative cache (same as Julia 1.10 legacy), since `CuArray` does not support `setfield!`-based mutation. ## View vs Array Return Types Type stability is critical for performance. 
AdaptiveArrayPools provides two APIs: -| API | 1D Return | N-D Return | Allocation | -|-----|-----------|------------|------------| -| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | -| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes (miss) | +| API | 1D Return | N-D Return | Allocation (Julia 1.11+) | Allocation (Julia 1.10 / CUDA) | +|-----|-----------|------------|--------------------------|-------------------------------| +| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | Always 0 bytes | +| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (setfield! reuse) | 0 bytes (hit) / ~100 bytes (miss) | !!! note "`Bit` type behavior" For `T === Bit`, both `acquire!` and `unsafe_acquire!` return native `BitVector` / `BitArray{N}` (not views). Cache hit achieves 0 bytes allocation. diff --git a/docs/src/architecture/type-dispatch.md b/docs/src/architecture/type-dispatch.md index 1feeb6fa..416d7fd9 100644 --- a/docs/src/architecture/type-dispatch.md +++ b/docs/src/architecture/type-dispatch.md @@ -21,57 +21,39 @@ end When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` - no dictionary lookup, no type instability. -## N-Way Set Associative Cache +## N-D Wrapper Caching for `unsafe_acquire!` -For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: +`unsafe_acquire!` returns native `Array` types. The caching strategy depends on Julia version: -``` - CACHE_WAYS = 4 (default) - +----+----+----+----+ -Slot 0 (Float64): |way0|way1|way2|way3| <-- round-robin eviction - +----+----+----+----+ - +----+----+----+----+ -Slot 1 (Float32): |way0|way1|way2|way3| - +----+----+----+----+ - ... -``` +### Julia 1.11+: `setfield!`-based Wrapper Reuse -### Cache Lookup Logic +Julia 1.11 made `Array` a mutable struct, enabling in-place field mutation: ```julia -function unsafe_acquire!(pool, T, dims...) 
- typed_pool = get_typed_pool!(pool, T) - slot = n_active + 1 - base = (slot - 1) * CACHE_WAYS - - # Search all ways for matching dimensions - for k in 1:CACHE_WAYS - idx = base + k - if dims == typed_pool.nd_dims[idx] - # Cache hit! Check if underlying vector was resized - if pointer matches - return typed_pool.nd_arrays[idx] - end - end - end - - # Cache miss: create new Array header, store in next way (round-robin) - way = typed_pool.nd_next_way[slot] - typed_pool.nd_next_way[slot] = (way % CACHE_WAYS) + 1 - # ... create and cache Array ... -end +# Cached wrapper reuse via setfield! (0-alloc) +setfield!(cached_arr, :ref, new_memory_ref) # update backing memory +setfield!(cached_arr, :size, new_dims) # update dimensions ``` -**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. The actual data memory is always reused from the pool. +Wrappers are stored in `nd_wrappers::Vector{Union{Nothing, Vector{Any}}}`, indexed directly by dimensionality N (~1ns lookup). **Unlimited dimension patterns per slot, zero allocation after warmup.** + +### Julia 1.10 / CUDA: N-Way Set Associative Cache + +On Julia 1.10 (CPU) and CUDA, `Array`/`CuArray` fields cannot be mutated. These paths use a 4-way set-associative cache with round-robin eviction (`CACHE_WAYS = 4` default): + +- **Cache hit** (≤4 dim patterns per slot): 0 bytes +- **Cache miss** (>4 patterns): ~80-144 bytes for Array header allocation + +See [Configuration](../features/configuration.md) for `CACHE_WAYS` tuning. --- ## View vs Array: When to Use What? 
-| API | Return Type | Allocation | Recommended For | -|-----|-------------|------------|-----------------| -| `acquire!` | `SubArray` / `ReshapedArray` | **Always 0 bytes** | 99% of cases | -| `unsafe_acquire!` | `Vector` / `Array` | 0-144 bytes | FFI, type constraints | +| API | Return Type | Allocation (Julia 1.11+) | Allocation (1.10 / CUDA) | Recommended For | +|-----|-------------|--------------------------|--------------------------|-----------------| +| `acquire!` | `SubArray` / `ReshapedArray` | **Always 0 bytes** | **Always 0 bytes** | 99% of cases | +| `unsafe_acquire!` | `Vector` / `Array` | **0 bytes** (setfield! reuse) | 0-144 bytes (N-way cache) | FFI, type constraints | ### Why View is the Default @@ -116,15 +98,15 @@ end | Operation | acquire! (View) | unsafe_acquire! (Array) | |-----------|-----------------|-------------------------| -| Allocation (cached) | 0 bytes | 0 bytes | -| Allocation (miss) | 0 bytes | 80-144 bytes | +| Allocation (Julia 1.11+) | 0 bytes | 0 bytes (setfield! reuse) | +| Allocation (Julia 1.10 / CUDA) | 0 bytes | 0 bytes (hit) / 80-144 bytes (miss) | | BLAS operations | Identical | Identical | | Type stability | Guaranteed | Guaranteed | | FFI compatibility | Requires conversion | Direct | -### Header Size by Dimensionality +### Header Size by Dimensionality (Julia 1.10 / CUDA only) -When `unsafe_acquire!` has a cache miss: +On Julia 1.11+ CPU, `unsafe_acquire!` is always zero-allocation via `setfield!` reuse. On Julia 1.10 and CUDA, a cache miss allocates an `Array` header: | Dimensions | Header Size | |------------|-------------| diff --git a/docs/src/basics/api-essentials.md b/docs/src/basics/api-essentials.md index c1bd503a..028bbaed 100644 --- a/docs/src/basics/api-essentials.md +++ b/docs/src/basics/api-essentials.md @@ -21,7 +21,7 @@ end ### `unsafe_acquire!(pool, T, dims...)` -Returns a native `Array` type. **Zero-allocation on cache hit**—only allocates a small header (~80-144 bytes) on cache miss. 
Use when you specifically need `Array{T,N}`: +Returns a native `Array` type. On **Julia 1.11+**, always **zero-allocation** via `setfield!`-based wrapper reuse (unlimited dimension patterns). On Julia 1.10 and CUDA, zero-allocation on cache hit with a small header (~80-144 bytes) on cache miss. Use when you specifically need `Array{T,N}`: ```julia @with_pool pool begin @@ -36,7 +36,7 @@ end ``` !!! tip "Cache behavior" - Same dimension pattern → **0 bytes**. Different pattern → 80-144 bytes header only (data memory always reused). See [N-Way Cache](../architecture/type-dispatch.md#n-way-set-associative-cache) for details. + On Julia 1.11+: **always 0 bytes** regardless of dimension pattern (setfield!-based reuse). On Julia 1.10 / CUDA: same dimension pattern → 0 bytes, different pattern → 80-144 bytes header only (data always reused). See [N-D Wrapper Caching](../architecture/type-dispatch.md#n-d-wrapper-caching-for-unsafe_acquire) for details. !!! note "`Bit` behavior" For `T === Bit`, `unsafe_acquire!` is equivalent to `acquire!` and returns native `BitVector`/`BitArray{N}`. 
@@ -113,7 +113,7 @@ end | Function | Returns | Allocation | Use Case | |----------|---------|------------|----------| | `acquire!(pool, T, dims...)` | View type | 0 bytes | Default choice | -| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0 (hit) / 80-144 (miss) | FFI, type constraints | +| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0 bytes (1.11+) / 0-144 (1.10/CUDA) | FFI, type constraints | | `zeros!(pool, [T,] dims...)` | View type | 0 bytes | Zero-initialized | | `ones!(pool, [T,] dims...)` | View type | 0 bytes | One-initialized | | `similar!(pool, A)` | View type | 0 bytes | Match existing array | diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index c59f4f40..37048004 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -79,16 +79,14 @@ Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x fast ### N-D Caching & Zero Allocation -The pool uses an N-way associative cache to efficiently reuse `BitArray{N}` instances: +The pool reuses `BitArray{N}` wrapper instances via `setfield!`-based in-place mutation (Julia 1.11+) or N-way cache (Julia 1.10 / CUDA): -| Scenario | Allocation | -|----------|------------| -| First call with new dims | ~944 bytes (new `BitArray{N}` created) | -| Subsequent call with same dims | **0 bytes** (cached instance reused) | -| Same ndims, different dims | **0 bytes** (dims/len fields modified in-place) | -| Different ndims | ~944 bytes (new `BitArray{N}` created and cached) | +| Scenario | Julia 1.11+ | Julia 1.10 / CUDA | +|----------|-------------|-------------------| +| First call with new (slot, N) | ~944 bytes (new `BitArray{N}`) | ~944 bytes | +| Subsequent call, any dims | **0 bytes** (setfield! reuse) | **0 bytes** (same ndims) / ~944 bytes (different ndims) | -Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-place modification of its `dims` and `len` fields. 
The pool exploits this to achieve **zero allocation** on repeated calls with matching dimensionality. +On Julia 1.11+, `BitArray` fields (`len`, `dims`, `chunks`) are mutated in-place via `setfield!`, achieving **zero allocation** on all repeated calls regardless of dimension pattern. ```julia @with_pool pool begin @@ -98,12 +96,12 @@ Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-plac # Rewind to reuse the same slot rewind!(pool) - # Same dims: 0 allocation (exact cache hit) + # Same dims: 0 allocation (cached wrapper reused) m2 = acquire!(pool, Bit, 100, 100) rewind!(pool) - # Different dims but same ndims: 0 allocation (dims modified in-place) + # Different dims but same ndims: 0 allocation (fields updated in-place) m3 = acquire!(pool, Bit, 50, 200) end ``` diff --git a/docs/src/features/configuration.md b/docs/src/features/configuration.md index fdc1d168..4cc3fd0c 100644 --- a/docs/src/features/configuration.md +++ b/docs/src/features/configuration.md @@ -70,9 +70,13 @@ POOL_DEBUG[] = false # Disable (default, production) When enabled, returning a pool-backed array from a `@with_pool` block will throw an error. -## Compile-time: CACHE_WAYS +## Compile-time: CACHE_WAYS (Julia 1.10 / CUDA only) -Configure the N-way cache size for `unsafe_acquire!`. Higher values reduce cache eviction but increase memory per slot. +Configure the N-way cache size for `unsafe_acquire!`. **On Julia 1.11+ CPU, this setting has no effect** — the `setfield!`-based wrapper reuse supports unlimited dimension patterns with zero allocation. + +This setting is relevant for: +- **Julia 1.10** (legacy N-way cache path) +- **CUDA backend** (N-way cache for `CuArray` wrappers) ```toml # LocalPreferences.toml @@ -88,15 +92,13 @@ set_cache_ways!(8) # Restart Julia for changes to take effect ``` -**When to increase**: If your code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). 
- -> **Scope**: `cache_ways` affects **all `unsafe_acquire!`** calls (including 1D). Only `acquire!` 1D uses simple 1:1 caching. +**When to increase**: If your CUDA code or Julia 1.10 code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). ## Summary | Setting | Scope | Restart? | Priority | Affects | |---------|-------|----------|----------|---------| | `use_pooling` | Compile-time | Yes | ⭐ Primary | All macros, `acquire!` behavior | -| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching | +| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching (Julia 1.10 / CUDA only) | | `MAYBE_POOLING_ENABLED` | Runtime | No | Optional | `@maybe_with_pool` only | | `POOL_DEBUG` | Runtime | No | Debug | Safety validation | diff --git a/docs/src/features/cuda-support.md b/docs/src/features/cuda-support.md index c5778c8c..13b2ce50 100644 --- a/docs/src/features/cuda-support.md +++ b/docs/src/features/cuda-support.md @@ -47,12 +47,17 @@ The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: **GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. -**CPU Memory**: -- Cache hit (≤4 dimension patterns per slot): 0 bytes -- Cache miss (>4 patterns): ~100 bytes for wrapper metadata +**CPU-side Wrapper Memory** (for `unsafe_acquire!` N-D on CUDA): +- The CUDA backend uses an N-way set-associative cache for `CuArray` wrapper reuse +- Cache hit (≤`CACHE_WAYS` dimension patterns per slot): 0 bytes +- Cache miss (>`CACHE_WAYS` patterns): ~100 bytes for wrapper metadata +- See [Configuration](configuration.md) for `CACHE_WAYS` tuning + +!!! note "CPU vs CUDA caching" + On CPU (Julia 1.11+), `unsafe_acquire!` uses `setfield!`-based wrapper reuse with **zero allocation for any number of dimension patterns**. 
The CUDA backend does not yet support this optimization and still uses the N-way cache. ```julia -# Example: 4 patterns fit in 4-way cache → zero CPU allocation +# Example: 4 patterns fit in default 4-way cache → zero CPU-side allocation dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) for dims in dims_list @with_pool :cuda p begin diff --git a/docs/src/reference/api.md b/docs/src/reference/api.md index a6c5c55e..2b808ea0 100644 --- a/docs/src/reference/api.md +++ b/docs/src/reference/api.md @@ -51,7 +51,7 @@ Default element type is `Float64` (CPU) or `Float32` (CUDA). | `USE_POOLING` | Compile-time constant to disable all pooling. | | `MAYBE_POOLING_ENABLED` | Runtime `Ref{Bool}` for `@maybe_with_pool`. | | `POOL_DEBUG` | Runtime `Ref{Bool}` to enable safety validation. | -| `set_cache_ways!(n)` | Set N-way cache size. | +| `set_cache_ways!(n)` | Set N-way cache size (Julia 1.10 / CUDA only; no effect on Julia 1.11+ CPU). | --- diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 7092822f..c369ca5b 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -19,28 +19,25 @@ export AbstractTypedPool, AbstractArrayPool # For subtyping export DisabledPool, DISABLED_CPU, pooling_enabled # Disabled pool support # Note: Extensions add methods to _get_pool_for_backend(::Val{:backend}) directly -# Core data structures -include("types.jl") - -# Debugging & validation utilities (needed by macros) -include("utils.jl") - -# Acquisition operations: get_view!, acquire!, unsafe_acquire!, aliases -include("acquire.jl") - -# BitArray-specific acquisition (SIMD-optimized BitVector operations) -include("bitarray.jl") - -# Convenience functions: zeros!, ones!, similar! -include("convenience.jl") - -# State management: checkpoint!, rewind!, reset!, empty! 
-include("state.jl") - -# Task-local pool -include("task_local_pool.jl") - -# Macros: @with_pool, @maybe_with_pool -include("macros.jl") +# All includes grouped under a single version branch +@static if VERSION >= v"1.11-" + include("types.jl") + include("utils.jl") + include("acquire.jl") + include("bitarray.jl") + include("convenience.jl") + include("state.jl") + include("task_local_pool.jl") + include("macros.jl") +else + include("legacy/types.jl") + include("utils.jl") + include("legacy/acquire.jl") + include("legacy/bitarray.jl") + include("convenience.jl") + include("legacy/state.jl") + include("task_local_pool.jl") + include("macros.jl") +end end # module diff --git a/src/acquire.jl b/src/acquire.jl index b0326f1a..b9c26118 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -91,60 +91,80 @@ function get_view!(tp::AbstractTypedPool{T}, n::Int) where {T} end # ============================================================================== -# Get N-D Array/View (Internal - Zero-Allocation Cache) +# Get N-D Array (setfield!-based Wrapper Reuse, Julia 1.11+) # ============================================================================== +# +# Julia 1.11+ changed Array to mutable struct {ref::MemoryRef{T}, size::NTuple{N,Int}}, +# enabling in-place mutation via setfield!. This eliminates N-way cache eviction limits: +# unlimited dimension patterns per slot, 0-alloc after warmup for any dims with same N. """ - get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) -> Array{T,N} + _store_nd_wrapper!(tp::AbstractTypedPool, N::Int, slot::Int, wrapper) -Get an N-dimensional `Array` from the pool with N-way caching. +Store a cached N-D wrapper for the given slot. Creates the per-N Vector if needed. 
""" -@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} - total_len = safe_prod(dims) - flat_view = get_view!(tp, total_len) # Increments n_active - slot = tp.n_active - - @inbounds vec = tp.vectors[slot] - current_ptr = UInt(pointer(vec)) - - # Expand cache slots if needed (CACHE_WAYS entries per slot) - n_slots_cached = length(tp.nd_next_way) - while slot > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) +function _store_nd_wrapper!(tp::AbstractTypedPool, N::Int, slot::Int, wrapper) + # Grow nd_wrappers vector so index N is valid + if N > length(tp.nd_wrappers) + old_len = length(tp.nd_wrappers) + resize!(tp.nd_wrappers, N) + for i in (old_len+1):N + @inbounds tp.nd_wrappers[i] = nothing end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 end - - base = (slot - 1) * CACHE_WAYS - - # Linear Search across all ways (Cache hit = 0 bytes) - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_dims = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - if cached_dims isa NTuple{N, Int} && cached_dims == dims && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::Array{T, N} + wrappers = @inbounds tp.nd_wrappers[N] + if wrappers === nothing + wrappers = Vector{Any}(nothing, slot) + @inbounds tp.nd_wrappers[N] = wrappers + elseif slot > length(wrappers) + old_len = length(wrappers) + resize!(wrappers, slot) + for i in (old_len+1):slot + @inbounds wrappers[i] = nothing end end + @inbounds wrappers[slot] = wrapper + nothing +end - # Cache Miss - Round-Robin Replacement - @inbounds way_offset = tp.nd_next_way[slot] - target_idx = base + way_offset + 1 +""" + get_nd_array!(tp::AbstractTypedPool{T,Vector{T}}, dims::NTuple{N,Int}) -> Array{T,N} - arr = wrap_array(tp, flat_view, dims) +Get an N-dimensional `Array` from the pool with `setfield!`-based wrapper reuse. 
- @inbounds tp.nd_arrays[target_idx] = arr - @inbounds tp.nd_dims[target_idx] = dims - @inbounds tp.nd_ptrs[target_idx] = current_ptr +Uses Julia 1.11+ `setfield!` to mutate cached `Array` wrappers in-place: +- Same N (dimensionality): `setfield!(arr, :size, dims)` — 0 allocation +- Backing memory: `setfield!(arr, :ref, ...)` — always updated, 0 allocation in compiled code +- First call per (slot, N): `unsafe_wrap` once, then cached forever - # Update round-robin counter - @inbounds tp.nd_next_way[slot] = (way_offset + 1) % CACHE_WAYS +Unlike the N-way cache (Julia 1.10), this has no eviction limit — unlimited dimension +patterns per slot are supported with zero allocation after warmup. +""" +@inline function get_nd_array!(tp::AbstractTypedPool{T, Vector{T}}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # Increments n_active, ensures backing vec + slot = tp.n_active + @inbounds vec = tp.vectors[slot] + # Look up cached wrapper for this dimensionality (direct index, no hash) + wrappers = N <= length(tp.nd_wrappers) ? (@inbounds tp.nd_wrappers[N]) : nothing + if wrappers !== nothing && slot <= length(wrappers) + wrapper = @inbounds wrappers[slot] + if wrapper !== nothing + arr = wrapper::Array{T, N} + # Always update ref: resize! can grow in-place without changing pointer, + # but the old MemoryRef still has the old (smaller) Memory length. + # setfield!(:ref) is 0-alloc in compiled code (only 32B at REPL top-level). 
+ setfield!(arr, :ref, getfield(vec, :ref)) + # Update dimensions (0-alloc: NTuple stored inline in mutable Array) + setfield!(arr, :size, dims) + return arr + end + end + + # Cache miss: first call for this (slot, N) — unsafe_wrap once + arr = wrap_array(tp, flat_view, dims) + _store_nd_wrapper!(tp, N, slot, arr) return arr end diff --git a/src/bitarray.jl b/src/bitarray.jl index eb9addf9..8e9e4ab3 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -1,13 +1,13 @@ # ============================================================================== -# BitArray Acquisition (N-D Cached BitArray API) +# BitArray Acquisition — Julia 1.11+ (setfield!-based Wrapper Reuse) # ============================================================================== # -# This file contains BitArray-specific pool operations, separated from the -# generic Array acquisition code in acquire.jl for maintainability. +# This file contains BitArray-specific pool operations for Julia 1.11+. +# Uses setfield!-based wrapper reuse for unlimited dim patterns, 0-alloc. # # Key components: # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type -# - get_bitarray! - N-D BitArray with shared chunks and N-way caching +# - get_bitarray! - N-D BitArray with setfield!-based caching # - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance # - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching # - DisabledPool fallbacks for Bit type @@ -15,27 +15,8 @@ # Design Decision: Unified BitArray Return Type # ============================================= # Unlike regular types where acquire! returns SubArray and unsafe_acquire! -# returns Array, for Bit type BOTH return BitArray{N}. This design choice is -# intentional for several reasons: -# -# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and -# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents -# because they use SIMD-optimized chunked algorithms. -# -# 2. 
**API Simplicity**: Users always get BitArray regardless of which API -# they call. No need to remember "use unsafe_acquire! for performance". -# -# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields -# when ndims matches, achieving 0 allocation on repeated calls. This is -# unique to BitArray - regular Array cannot modify dims in place. -# -# 4. **Backwards Compatibility**: Code using trues!/falses! just works with -# optimal performance - these convenience functions return BitVector. -# -# Implementation: -# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! -# - get_bitarray! creates BitArray shells sharing pool's chunks -# - N-way cache stores BitArray{N} entries, reused via dims modification +# returns Array, for Bit type BOTH return BitArray{N}. This ensures users +# always get SIMD-optimized performance (~10-100x faster count/sum/bitwise). # ============================================================================== # ============================================================================== @@ -55,14 +36,7 @@ Get a BitArray{N} that shares `chunks` with the pooled BitVector. -Uses N-way cache for BitArray reuse. Unlike Array which requires unsafe_wrap -for each shape, BitArray can reuse cached entries by modifying `dims`/`len` -fields when ndims matches (0 bytes allocation). - -## Cache Strategy -- **Exact match**: Return cached BitArray directly (0 bytes) -- **Same ndims**: Modify dims/len/chunks of cached entry (0 bytes) -- **Different ndims**: Create new BitArray{N} and cache it (~944 bytes) +Uses `setfield!`-based wrapper reuse — unlimited dim patterns, 0-alloc after warmup. 
## Implementation Notes - BitVector (N=1): `size()` uses `len` field, `dims` is ignored @@ -87,19 +61,8 @@ function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} ba = BitArray{N}(undef, dims) ba.chunks = pool_bv.chunks - # Expand N-way cache (CACHE_WAYS entries per slot) - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - - # Cache in first way - base = (idx - 1) * CACHE_WAYS + 1 - @inbounds tp.nd_arrays[base] = ba - @inbounds tp.nd_dims[base] = dims - @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + # Cache the wrapper + _store_nd_wrapper!(tp, N, idx, ba) # Warn at powers of 2 (possible missing rewind!) if idx >= 512 && (idx & (idx - 1)) == 0 @@ -115,44 +78,25 @@ function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} if length(pool_bv) != total_len resize!(pool_bv, total_len) end - current_ptr = UInt(pointer(pool_bv.chunks)) - base = (idx - 1) * CACHE_WAYS - - # 3. Check N-way cache for hit - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_dims = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - # Must check isa FIRST for type stability (avoids boxing in == comparison) - if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr - if cached_dims == dims - # Exact match - return cached BitArray directly (0 alloc) - return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} - else - # Same ndims but different dims - reuse by modifying fields (0 alloc!) - ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} - ba.len = total_len - ba.dims = dims - ba.chunks = pool_bv.chunks - # Update cache metadata - @inbounds tp.nd_dims[cache_idx] = dims - return ba - end + + # 3. Check wrapper cache (direct index, no hash) + wrappers = N <= length(tp.nd_wrappers) ? 
(@inbounds tp.nd_wrappers[N]) : nothing + if wrappers !== nothing && idx <= length(wrappers) + wrapper = @inbounds wrappers[idx] + if wrapper !== nothing + ba = wrapper::BitArray{N} + # Update fields in-place (all 0-alloc via setfield!) + setfield!(ba, :len, total_len) + setfield!(ba, :dims, dims) + setfield!(ba, :chunks, pool_bv.chunks) + return ba end end - # 4. Cache miss - create new BitArray{N} + # 4. Cache miss: first call for this (slot, N) ba = BitArray{N}(undef, dims) ba.chunks = pool_bv.chunks - - # Round-robin replacement - @inbounds way_offset = tp.nd_next_way[idx] - target_idx = base + way_offset + 1 - @inbounds tp.nd_arrays[target_idx] = ba - @inbounds tp.nd_dims[target_idx] = dims - @inbounds tp.nd_ptrs[target_idx] = current_ptr - @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + _store_nd_wrapper!(tp, N, idx, ba) return ba end diff --git a/src/legacy/acquire.jl b/src/legacy/acquire.jl new file mode 100644 index 00000000..eb47ab03 --- /dev/null +++ b/src/legacy/acquire.jl @@ -0,0 +1,479 @@ +# ============================================================================== +# Allocation Dispatch Points (for extensibility) +# ============================================================================== + +# Allocate a new vector (dispatch point for extensions) +@inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} = + Vector{T}(undef, n) + +# Wrap flat view into N-D array (dispatch point for extensions) +@inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + unsafe_wrap(Array{T,N}, pointer(flat_view), dims) +end + +# ============================================================================== +# Helper: Overflow-Safe Product +# ============================================================================== + +""" + safe_prod(dims::NTuple{N, Int}) -> Int + +Compute the product of dimensions with overflow checking. 
+ +Throws `OverflowError` if the product exceeds `typemax(Int)`, preventing +memory corruption from integer overflow in `unsafe_wrap` operations. + +## Performance +Adds ~0.3-1.2 ns overhead (<1%) compared to unchecked `prod()`, which is +negligible relative to the 100-200 ns cost of the full allocation path. +""" +@inline function safe_prod(dims::NTuple{N, Int}) where {N} + total = 1 + for d in dims + total = Base.checked_mul(total, d) + end + return total +end + +# ============================================================================== +# Get 1D View (Internal - Zero-Allocation Cache) +# ============================================================================== + +""" + get_view!(tp::AbstractTypedPool{T}, n::Int) + +Get a 1D vector view of size `n` from the typed pool. +Returns cached view on hit (zero allocation), creates new on miss. +""" +function get_view!(tp::AbstractTypedPool{T}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. Need to expand pool (new slot) + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + new_view = view(tp.vectors[idx], 1:n) + push!(tp.views, new_view) + push!(tp.view_lengths, n) + + # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!() + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bytes = sum(length, tp.vectors) * sizeof(T) + @warn "$(nameof(typeof(tp))){$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + end + + return new_view + end + + # 2. Cache hit: same size requested -> return cached view (ZERO ALLOC) + @inbounds cached_len = tp.view_lengths[idx] + if cached_len == n + return @inbounds tp.views[idx] + end + + # 3. 
Cache miss: different size -> update cache + @inbounds vec = tp.vectors[idx] + if length(vec) < n + resize!(vec, n) + end + + new_view = view(vec, 1:n) + @inbounds tp.views[idx] = new_view + @inbounds tp.view_lengths[idx] = n + + return new_view +end + +# ============================================================================== +# Get N-D Array (N-way Set-Associative Cache, Julia ≤1.10) +# ============================================================================== +# +# On Julia ≤1.10, Array is not a mutable struct, so setfield! cannot be used. +# This provides the N-way cache that stores up to CACHE_WAYS different +# (dims, pointer) patterns per slot via round-robin replacement. + +""" + get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) -> Array{T,N} + +Get an N-dimensional `Array` from the pool with N-way caching. + +Uses a set-associative cache with `CACHE_WAYS` entries per slot (default: 4). +Cache hit (exact dims + pointer match) returns the cached Array at zero cost. +Cache miss creates a new `unsafe_wrap`'d Array (~96 bytes) and stores it via +round-robin replacement. 
+""" +@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # Increments n_active + slot = tp.n_active + + @inbounds vec = tp.vectors[slot] + current_ptr = UInt(pointer(vec)) + + # Expand cache slots if needed (CACHE_WAYS entries per slot) + n_slots_cached = length(tp.nd_next_way) + while slot > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (slot - 1) * CACHE_WAYS + + # Linear Search across all ways (Cache hit = 0 bytes) + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_dims isa NTuple{N, Int} && cached_dims == dims && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::Array{T, N} + end + end + + # Cache Miss - Round-Robin Replacement + @inbounds way_offset = tp.nd_next_way[slot] + target_idx = base + way_offset + 1 + + arr = wrap_array(tp, flat_view, dims) + + @inbounds tp.nd_arrays[target_idx] = arr + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr + + # Update round-robin counter + @inbounds tp.nd_next_way[slot] = (way_offset + 1) % CACHE_WAYS + + return arr +end + +""" + get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) + +Get an N-dimensional view via `reshape` (zero creation cost). 
+""" +@inline function get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached, 0 alloc) + return reshape(flat_view, dims) # ReshapedArray (0 creation cost) +end + +# ============================================================================== +# Type Touch Recording (for selective rewind) +# ============================================================================== + +""" + _record_type_touch!(pool::AbstractArrayPool, ::Type{T}) + +Record that type `T` was touched (acquired) at the current checkpoint depth. +Called by `acquire!` and convenience wrappers; macro-transformed calls use +`_acquire_impl!` directly (bypassing this for zero overhead). + +For fixed-slot types, sets the corresponding bit in `_touched_type_masks`. +For non-fixed-slot types, sets `_touched_has_others` flag. +""" +@inline function _record_type_touch!(pool::AbstractArrayPool, ::Type{T}) where {T} + depth = pool._current_depth + b = _fixed_slot_bit(T) + if b == UInt16(0) + @inbounds pool._touched_has_others[depth] = true + else + @inbounds pool._touched_type_masks[depth] |= b + end + nothing +end + +# CPU-specific override: adds lazy first-touch checkpoint in lazy mode +# and typed-lazy mode. +# _LAZY_MODE_BIT (bit 15) in _touched_type_masks[depth] ↔ depth entered via _lazy_checkpoint! +# _TYPED_LAZY_BIT (bit 14) in _touched_type_masks[depth] ↔ depth entered via _typed_lazy_checkpoint! +# On the first acquire of each fixed-slot type T at that depth, we retroactively save +# n_active BEFORE the acquire (current value is still the parent's count), so that +# the subsequent rewind can restore the parent's state correctly. 
+@inline function _record_type_touch!(pool::AdaptiveArrayPool, ::Type{T}) where {T} + depth = pool._current_depth + b = _fixed_slot_bit(T) + if b == UInt16(0) + @inbounds pool._touched_has_others[depth] = true + else + current_mask = @inbounds pool._touched_type_masks[depth] + # Lazy checkpoint: lazy mode (bit 15) OR typed lazy mode (bit 14), AND first touch. + # Guard: skip if already checkpointed at this depth (prevents double-push when a + # tracked type is also acquired by a helper via acquire! → _record_type_touch!). + if (current_mask & _MODE_BITS_MASK) != 0 && (current_mask & b) == 0 + tp = get_typed_pool!(pool, T) + if @inbounds(tp._checkpoint_depths[end]) != depth + _checkpoint_typed_pool!(tp, depth) + end + end + @inbounds pool._touched_type_masks[depth] = current_mask | b + end + nothing +end + +# ============================================================================== +# Internal Implementation Functions (called by macro-transformed code) +# ============================================================================== + +""" + _acquire_impl!(pool, Type{T}, n) -> SubArray{T,1,Vector{T},...} + _acquire_impl!(pool, Type{T}, dims...) -> ReshapedArray{T,N,...} + +Internal implementation of acquire!. Called directly by macro-transformed code +(no type touch recording). User code calls `acquire!` which adds recording. +""" +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + tp = get_typed_pool!(pool, T) + return get_view!(tp, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_view!(tp, dims) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + _acquire_impl!(pool, T, dims...) 
+end + +# Similar-style +@inline _acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _acquire_impl!(pool, eltype(x), size(x)) + +""" + _unsafe_acquire_impl!(pool, Type{T}, dims...) -> Array{T,N} + +Internal implementation of unsafe_acquire!. Called directly by macro-transformed code. +""" +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, (n,)) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) +end + +# Similar-style +@inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) + +# ============================================================================== +# Acquisition API (User-facing with type touch recording) +# ============================================================================== + +""" + acquire!(pool, Type{T}, n) -> view type + acquire!(pool, Type{T}, dims...) -> view type + acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> view type + +Acquire a pooled array of type `T` with size `n` or dimensions `dims`. + +Returns a pooled array (backend-dependent type): +- **CPU 1D**: `SubArray{T,1,Vector{T},...}` (parent is `Vector{T}`) +- **CPU N-D**: `ReshapedArray{T,N,...}` (zero creation cost) +- **Bit** (`T === Bit`): `BitVector` / `BitArray{N}` (chunks-sharing, SIMD optimized) +- **CUDA**: `CuArray{T,N}` (unified N-way cache) + +For CPU numeric arrays, the return types are `StridedArray`, compatible with +BLAS and broadcasting. 
+ +For type-unspecified paths (struct fields without concrete type parameters), +use [`unsafe_acquire!`](@ref) instead - cached native array instances can be reused. + +## Example +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) # 1D view + m = acquire!(pool, Float64, 10, 10) # 2D view + v .= 1.0 + m .= 2.0 + sum(v) + sum(m) +end +``` + +See also: [`unsafe_acquire!`](@ref) for native array access. +""" +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + _record_type_touch!(pool, T) + _acquire_impl!(pool, T, n) +end + +# Multi-dimensional support (zero-allocation with N-D cache) +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + _record_type_touch!(pool, T) + _acquire_impl!(pool, T, dims...) +end + +# Tuple support: allows acquire!(pool, T, size(A)) where size(A) returns NTuple{N,Int} +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + _record_type_touch!(pool, T) + _acquire_impl!(pool, T, dims...) +end + +# Similar-style convenience methods +""" + acquire!(pool, x::AbstractArray) -> SubArray + +Acquire an array with the same element type and size as `x` (similar to `similar(x)`). + +## Example +```julia +A = rand(10, 10) +@with_pool pool begin + B = acquire!(pool, A) # Same type and size as A + B .= A .* 2 +end +``` +""" +@inline function acquire!(pool::AbstractArrayPool, x::AbstractArray) + _record_type_touch!(pool, eltype(x)) + _acquire_impl!(pool, eltype(x), size(x)) +end + +# ============================================================================== +# Unsafe Acquisition API (Raw Arrays) +# ============================================================================== + +""" + unsafe_acquire!(pool, Type{T}, n) -> backend's native array type + unsafe_acquire!(pool, Type{T}, dims...) 
-> backend's native array type + unsafe_acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> backend's native array type + +Acquire a native array backed by pool memory. + +Returns the backend's native array type: +- **CPU**: `Array{T,N}` (via `unsafe_wrap`) +- **Bit** (`T === Bit`): `BitVector` / `BitArray{N}` (chunks-sharing; equivalent to `acquire!`) +- **CUDA**: `CuArray{T,N}` (via unified view cache) + +## Safety Warning +The returned array is only valid within the `@with_pool` scope. Using it after +the scope ends leads to undefined behavior (use-after-free, data corruption). + +**Do NOT call `resize!`, `push!`, or `append!` on returned arrays** - this causes +undefined behavior as the memory is owned by the pool. + +## When to Use +- **Type-unspecified paths**: Struct fields without concrete type parameters +- FFI calls expecting raw pointers +- APIs that strictly require native array types + +## Example +```julia +@with_pool pool begin + A = unsafe_acquire!(pool, Float64, 100, 100) # Matrix{Float64} + B = unsafe_acquire!(pool, Float64, 100, 100) + C = similar(A) # Regular allocation for result + mul!(C, A, B) # BLAS uses A, B directly +end +# A and B are INVALID after this point! +``` + +See also: [`acquire!`](@ref) for view-based access. +""" +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} + _record_type_touch!(pool, T) + _unsafe_acquire_impl!(pool, T, n) +end + +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + _record_type_touch!(pool, T) + _unsafe_acquire_impl!(pool, T, dims...) +end + +# Tuple support +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} + _record_type_touch!(pool, T) + _unsafe_acquire_impl!(pool, T, dims) +end + +# Similar-style convenience methods +""" + unsafe_acquire!(pool, x::AbstractArray) -> Array + +Acquire a raw array with the same element type and size as `x` (similar to `similar(x)`). 
+ +## Example +```julia +A = rand(10, 10) +@with_pool pool begin + B = unsafe_acquire!(pool, A) # Matrix{Float64}, same size as A + B .= A .* 2 +end +``` +""" +@inline function unsafe_acquire!(pool::AbstractArrayPool, x::AbstractArray) + _record_type_touch!(pool, eltype(x)) + _unsafe_acquire_impl!(pool, eltype(x), size(x)) +end + +# ============================================================================== +# API Aliases +# ============================================================================== + +""" + acquire_view!(pool, Type{T}, dims...) + +Alias for [`acquire!`](@ref). + +Explicit name emphasizing the return type is a view (`SubArray`/`ReshapedArray`), +not a raw `Array`. Use when you prefer symmetric naming with `acquire_array!`. +""" +const acquire_view! = acquire! + +""" + acquire_array!(pool, Type{T}, dims...) + +Alias for [`unsafe_acquire!`](@ref). + +Explicit name emphasizing the return type is a raw `Array`. +Use when you prefer symmetric naming with `acquire_view!`. +""" +const acquire_array! = unsafe_acquire! + +# Internal implementation aliases (for macro transformation) +const _acquire_view_impl! = _acquire_impl! +const _acquire_array_impl! = _unsafe_acquire_impl! + +# ============================================================================== +# DisabledPool Acquire Fallbacks (pooling disabled with backend context) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{T}, n::Int) where {T} = Vector{T}(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = Array{T,N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) + +# --- unsafe_acquire! 
for DisabledPool{:cpu} --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, n::Int) where {T} = Vector{T}(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = Array{T,N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) + +# --- Generic DisabledPool fallbacks (unknown backend → error) --- +@inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) +@inline unsafe_acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) + +# --- _impl! delegators for DisabledPool (macro transformation support) --- +# Called when: USE_POOLING=true + @maybe_with_pool + MAYBE_POOLING_ENABLED[]=false +# Explicit overloads for proper inlining (especially important for CUDA backend). +@inline _acquire_impl!(p::DisabledPool, ::Type{T}, n::Int) where {T} = acquire!(p, T, n) +@inline _acquire_impl!(p::DisabledPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = acquire!(p, T, dims...) +@inline _acquire_impl!(p::DisabledPool, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = acquire!(p, T, dims) +@inline _acquire_impl!(p::DisabledPool, x::AbstractArray) = acquire!(p, x) + +@inline _unsafe_acquire_impl!(p::DisabledPool, ::Type{T}, n::Int) where {T} = unsafe_acquire!(p, T, n) +@inline _unsafe_acquire_impl!(p::DisabledPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = unsafe_acquire!(p, T, dims...) 
+@inline _unsafe_acquire_impl!(p::DisabledPool, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = unsafe_acquire!(p, T, dims) +@inline _unsafe_acquire_impl!(p::DisabledPool, x::AbstractArray) = unsafe_acquire!(p, x) diff --git a/src/legacy/bitarray.jl b/src/legacy/bitarray.jl new file mode 100644 index 00000000..97731f68 --- /dev/null +++ b/src/legacy/bitarray.jl @@ -0,0 +1,205 @@ +# ============================================================================== +# BitArray Acquisition — Legacy (Julia ≤1.10, N-way Set-Associative Cache) +# ============================================================================== +# +# This file contains BitArray-specific pool operations for Julia ≤1.10. +# Uses N-way set-associative cache for N-D BitArray caching. +# +# Key components: +# - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type +# - get_bitarray! - N-D BitArray with shared chunks and N-way caching +# - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance +# - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching +# - DisabledPool fallbacks for Bit type +# +# Design Decision: Unified BitArray Return Type +# ============================================= +# Unlike regular types where acquire! returns SubArray and unsafe_acquire! +# returns Array, for Bit type BOTH return BitArray{N}. This design choice is +# intentional for several reasons: +# +# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and +# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents +# because they use SIMD-optimized chunked algorithms. +# +# 2. **API Simplicity**: Users always get BitArray regardless of which API +# they call. No need to remember "use unsafe_acquire! for performance". +# +# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields +# when ndims matches, achieving 0 allocation on repeated calls. 
This is +# unique to BitArray - regular Array cannot modify dims in place. +# +# 4. **Backwards Compatibility**: Code using trues!/falses! just works with +# optimal performance - these convenience functions return BitVector. +# +# Implementation: +# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! +# - get_bitarray! creates BitArray shells sharing pool's chunks +# - N-way cache stores BitArray{N} entries, reused via dims modification +# ============================================================================== + +# ============================================================================== +# Fill Value Dispatch (BitArray-specific) +# ============================================================================== + +# Bit type returns Bool element type for fill operations (zero/one) +@inline Base.zero(::Type{Bit}) = false +@inline Base.one(::Type{Bit}) = true + +# ============================================================================== +# BitArray Acquisition (N-way set-associative cache, Julia ≤1.10) +# ============================================================================== + +""" + get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) -> BitArray{N} + +Get a BitArray{N} that shares `chunks` with the pooled BitVector. + +Uses N-way set-associative cache with up to CACHE_WAYS patterns per slot. + +## Implementation Notes +- BitVector (N=1): `size()` uses `len` field, `dims` is ignored +- BitArray{N>1}: `size()` uses `dims` field +- All BitArrays share `chunks` with the pool's backing BitVector + +## Safety +The returned BitArray is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} + total_len = safe_prod(dims) + tp.n_active += 1 + idx = tp.n_active + + # 1. 
Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, total_len) + push!(tp.vectors, pool_bv) + + # Create BitArray sharing chunks + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = ba + @inbounds tp.nd_dims[base] = dims + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bytes = sum(_vector_bytes, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + end + + return ba + end + + # 2. Ensure pool_bv has correct size + @inbounds pool_bv = tp.vectors[idx] + if length(pool_bv) != total_len + resize!(pool_bv, total_len) + end + current_ptr = UInt(pointer(pool_bv.chunks)) + base = (idx - 1) * CACHE_WAYS + + # 3. Check N-way cache for hit + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + # Must check isa FIRST for type stability (avoids boxing in == comparison) + if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr + if cached_dims == dims + # Exact match - return cached BitArray directly (0 alloc) + return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + else + # Same ndims but different dims - reuse by modifying fields (0 alloc!) + ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + ba.len = total_len + ba.dims = dims + ba.chunks = pool_bv.chunks + # Update cache metadata + @inbounds tp.nd_dims[cache_idx] = dims + return ba + end + end + end + + # 4. 
Cache miss - create new BitArray{N} + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = ba + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return ba +end + +# Convenience: 1D case wraps to tuple +@inline get_bitarray!(tp::BitTypedPool, n::Int) = get_bitarray!(tp, (n,)) + +# ============================================================================== +# Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) +# ============================================================================== + +# Bit type: delegates to _unsafe_acquire_impl! for SIMD performance +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + return _unsafe_acquire_impl!(pool, Bit, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) 
+end + +# ============================================================================== +# Unsafe Acquire Implementation (Bit type) +# ============================================================================== + +# Bit type: returns BitArray{N} with shared chunks (SIMD optimized, N-D cached) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) +end + +# ============================================================================== +# DisabledPool Fallbacks (Bit type) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + +# --- unsafe_acquire! 
for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) diff --git a/src/legacy/state.jl b/src/legacy/state.jl new file mode 100644 index 00000000..1b512104 --- /dev/null +++ b/src/legacy/state.jl @@ -0,0 +1,640 @@ +# ============================================================================== +# State Management — Legacy (Julia ≤1.10) +# ============================================================================== +# Identical to v1.11+ state.jl except empty! clears legacy N-way cache fields +# (nd_arrays, nd_dims, nd_ptrs, nd_next_way) instead of nd_wrappers. + +# ============================================================================== +# State Management - checkpoint! +# ============================================================================== + +""" + checkpoint!(pool::AdaptiveArrayPool) + +Save the current pool state (n_active counters) to internal stacks. + +This is called automatically by `@with_pool` and related macros. +After warmup, this function has **zero allocation**. 
+ +See also: [`rewind!`](@ref), [`@with_pool`](@ref) +""" +function checkpoint!(pool::AdaptiveArrayPool) + + # Increment depth and initialize type touch tracking state + pool._current_depth += 1 + push!(pool._touched_type_masks, UInt16(0)) + push!(pool._touched_has_others, false) + depth = pool._current_depth + + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) + end + + # Others - iterate without allocation (values() returns iterator) + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + end + + return nothing +end + +""" + checkpoint!(pool::AdaptiveArrayPool, ::Type{T}) + +Save state for a specific type only. Used by optimized macros that know +which types will be used at compile time. + +Also updates _current_depth and bitmask state for type touch tracking. + +~77% faster than full checkpoint! when only one type is used. +""" +@inline function checkpoint!(pool::AdaptiveArrayPool, ::Type{T}) where T + + pool._current_depth += 1 + push!(pool._touched_type_masks, UInt16(0)) + # Push true when T is a fallback type (non-fixed-slot) so that + # _typed_lazy_rewind! iterates pool.others even if _acquire_impl! + # (which bypasses _record_type_touch!) is the only acquire path. + push!(pool._touched_has_others, _fixed_slot_bit(T) == UInt16(0)) + _checkpoint_typed_pool!(get_typed_pool!(pool, T), pool._current_depth) + nothing +end + +""" + checkpoint!(pool::AdaptiveArrayPool, types::Type...) + +Save state for multiple specific types. Uses @generated for zero-overhead +compile-time unrolling. Increments _current_depth once for all types. +""" +@generated function checkpoint!(pool::AdaptiveArrayPool, types::Type...) 
+ + # Deduplicate types at compile time (e.g., Float64, Float64 → Float64) + seen = Set{Any}() + unique_indices = Int[] + for i in eachindex(types) + if !(types[i] in seen) + push!(seen, types[i]) + push!(unique_indices, i) + end + end + # Check at compile time if any type is a fallback (non-fixed-slot). + # If so, push has_others=true so _typed_lazy_rewind! iterates pool.others + # even when _acquire_impl! (bypassing _record_type_touch!) is used. + has_any_fallback = any(i -> _fixed_slot_bit(types[i].parameters[1]) == UInt16(0), unique_indices) + checkpoint_exprs = [:(_checkpoint_typed_pool!(get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in unique_indices] + quote + pool._current_depth += 1 + push!(pool._touched_type_masks, UInt16(0)) + push!(pool._touched_has_others, $has_any_fallback) + $(checkpoint_exprs...) + nothing + end +end + +# Internal helper for checkpoint (works for any AbstractTypedPool) +@inline function _checkpoint_typed_pool!(tp::AbstractTypedPool, depth::Int) + + # Guard: skip if already checkpointed at this depth (prevents double-push + # when get_typed_pool! auto-checkpoints a new fallback type and then + # checkpoint!(pool, types...) calls _checkpoint_typed_pool! for the same type). + if @inbounds(tp._checkpoint_depths[end]) != depth + push!(tp._checkpoint_n_active, tp.n_active) + push!(tp._checkpoint_depths, depth) + end + nothing +end + +""" + _lazy_checkpoint!(pool::AdaptiveArrayPool) + +Lightweight checkpoint for lazy mode (`use_typed=false` macro path). + +Increments `_current_depth` and pushes bitmask sentinels — but does **not** save +`n_active` for any fixed-slot typed pool. The `_LAZY_MODE_BIT` (bit 15) in +`_touched_type_masks` marks this depth as lazy mode so that +`_record_type_touch!` can trigger lazy first-touch checkpoints. 
+ +Existing `others` entries are eagerly checkpointed since there is no per-type +tracking for non-fixed-slot pools; Case B in `_rewind_typed_pool!` handles any +new `others` entries created during the scope (n_active starts at 0 = sentinel). + +Performance: ~2ns vs ~540ns for full `checkpoint!`. +""" +@inline function _lazy_checkpoint!(pool::AdaptiveArrayPool) + + pool._current_depth += 1 + # _LAZY_MODE_BIT = lazy mode flag (bits 0–7 are fixed-slot type bits) + push!(pool._touched_type_masks, _LAZY_MODE_BIT) + push!(pool._touched_has_others, false) + depth = pool._current_depth + # Eagerly checkpoint any pre-existing others entries. + # New others types created during the scope start at n_active=0 (sentinel covers them). + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + @inbounds pool._touched_has_others[depth] = true + end + nothing +end + +# ============================================================================== +# State Management - rewind! +# ============================================================================== + +""" + rewind!(pool::AdaptiveArrayPool) + +Restore the pool state (n_active counters) from internal stacks. +Uses _checkpoint_depths to accurately determine which entries to pop vs restore. + +Only the counters are restored; allocated memory remains for reuse. +Handles touched types by checking _checkpoint_depths for accurate restoration. + +**Safety**: If called at global scope (depth=1, no pending checkpoints), +automatically delegates to `reset!` to safely clear all n_active counters. + +See also: [`checkpoint!`](@ref), [`reset!`](@ref), [`@with_pool`](@ref) +""" +function rewind!(pool::AdaptiveArrayPool) + + cur_depth = pool._current_depth + + # Safety guard: at global scope (depth=1), no checkpoint to rewind to + # Delegate to reset! 
which safely clears all n_active counters + if cur_depth == 1 + reset!(pool) + return nothing + end + + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, cur_depth) + end + + # Process fallback types + for tp in values(pool.others) + _rewind_typed_pool!(tp, cur_depth) + end + + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + + return nothing +end + +""" + rewind!(pool::AdaptiveArrayPool, ::Type{T}) + +Restore state for a specific type only. +Also updates _current_depth and bitmask state. +""" +@inline function rewind!(pool::AdaptiveArrayPool, ::Type{T}) where T + + # Safety guard: at global scope (depth=1), delegate to reset! + if pool._current_depth == 1 + reset!(get_typed_pool!(pool, T)) + return nothing + end + _rewind_typed_pool!(get_typed_pool!(pool, T), pool._current_depth) + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing +end + +""" + rewind!(pool::AdaptiveArrayPool, types::Type...) + +Restore state for multiple specific types in reverse order. +Decrements _current_depth once after all types are rewound. +""" +@generated function rewind!(pool::AdaptiveArrayPool, types::Type...) + + # Deduplicate types at compile time (e.g., Float64, Float64 → Float64) + seen = Set{Any}() + unique_indices = Int[] + for i in eachindex(types) + if !(types[i] in seen) + push!(seen, types[i]) + push!(unique_indices, i) + end + end + rewind_exprs = [:(_rewind_typed_pool!(get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in reverse(unique_indices)] + reset_exprs = [:(reset!(get_typed_pool!(pool, types[$i]))) for i in unique_indices] + quote + # Safety guard: at global scope (depth=1), delegate to reset! + if pool._current_depth == 1 + $(reset_exprs...) + return nothing + end + $(rewind_exprs...) 
+ pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing + end +end + +# Internal helper for rewind with orphan cleanup (works for any AbstractTypedPool) +# Uses 1-based sentinel pattern: no isempty checks needed (sentinel [0] guarantees non-empty) +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) + + # 1. Orphaned Checkpoints Cleanup + # If there are checkpoints from deeper scopes (depth > current), pop them first. + # This happens when a nested scope did full checkpoint but typed rewind, + # leaving orphaned checkpoints that must be cleaned before finding current state. + while @inbounds tp._checkpoint_depths[end] > current_depth + pop!(tp._checkpoint_depths) + pop!(tp._checkpoint_n_active) + end + + # 2. Normal Rewind Logic (Sentinel Pattern) + # Now the stack top is guaranteed to be at depth <= current depth. + if @inbounds tp._checkpoint_depths[end] == current_depth + # Checkpointed at current depth: pop and restore + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + else + # No checkpoint at current depth (this type was excluded from typed checkpoint) + # MUST restore n_active from parent checkpoint value! + # - Untracked acquire may have modified n_active + # - If sentinel (_checkpoint_n_active=[0]), restores to n_active=0 + tp.n_active = @inbounds tp._checkpoint_n_active[end] + end + nothing +end + +""" + _lazy_rewind!(pool::AdaptiveArrayPool) + +Complete rewind for lazy mode (`use_typed=false` macro path). + +Reads the combined mask at the current depth, rewinds only the fixed-slot pools +whose bits are set, handles any `others` entries, then pops the depth metadata. + +Called directly from the macro-generated `finally` clause as a single function call +(matching the structure of `_lazy_checkpoint!` for symmetry and performance). 
+""" +@inline function _lazy_rewind!(pool::AdaptiveArrayPool) + + d = pool._current_depth + bits = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK + _selective_rewind_fixed_slots!(pool, bits) + if @inbounds(pool._touched_has_others[d]) + for tp in values(pool.others) + _rewind_typed_pool!(tp, d) + end + end + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing +end + +""" + _typed_lazy_checkpoint!(pool::AdaptiveArrayPool, types::Type...) + +Typed checkpoint that enables lazy first-touch checkpointing for extra types touched +by helpers (`use_typed=true`, `_can_use_typed_path=false` path). + +Calls `checkpoint!(pool, types...)` (checkpoints only the statically-known types), +then sets `_TYPED_LAZY_BIT` (bit 14) in `_touched_type_masks[depth]` to signal typed lazy mode. + +`_record_type_touch!` checks `(mask & _MODE_BITS_MASK) != 0` (bit 14 OR bit 15) to trigger a +lazy first-touch checkpoint for each extra type on first acquire, ensuring Case A +(not Case B) applies at rewind and parent `n_active` is preserved correctly. +""" +@inline function _typed_lazy_checkpoint!(pool::AdaptiveArrayPool, types::Type...) + checkpoint!(pool, types...) + d = pool._current_depth + @inbounds pool._touched_type_masks[d] |= _TYPED_LAZY_BIT + + # Eagerly snapshot pre-existing others entries — mirrors _lazy_checkpoint!. + # _record_type_touch! cannot lazy-checkpoint others types (b==0 branch, no per-type bit). + # Without this, a helper that re-acquires an already-active others type triggers Case B + # at rewind and restores the wrong parent n_active value. + # + # Also set has_others=true when pool.others is non-empty, so _typed_lazy_rewind! + # enters the others loop even for tracked non-fixed-slot types (e.g. CPU Float16) that + # used _acquire_impl! (bypassing _record_type_touch!, leaving has_others=false otherwise). + # Skip re-snapshot for entries already checkpointed at d by checkpoint!(pool, types...) + # (e.g. 
Float16 in types... was just checkpointed above — avoid double-push). + for p in values(pool.others) + if @inbounds(p._checkpoint_depths[end]) != d + _checkpoint_typed_pool!(p, d) + end + @inbounds pool._touched_has_others[d] = true + end + nothing +end + +""" + _typed_lazy_rewind!(pool::AdaptiveArrayPool, tracked_mask::UInt16) + +Selective rewind for typed mode (`use_typed=true`) fallback path. + +Called when `_can_use_typed_path` returns false (helpers touched types beyond the +statically-tracked set). Rewinds only pools whose bits are set in +`tracked_mask | touched_mask`. All touched types have Case A checkpoints, +guaranteed by the `_TYPED_LAZY_BIT` mode set in `_typed_lazy_checkpoint!`. +""" +@inline function _typed_lazy_rewind!(pool::AdaptiveArrayPool, tracked_mask::UInt16) + + d = pool._current_depth + touched = @inbounds(pool._touched_type_masks[d]) & _TYPE_BITS_MASK + combined = tracked_mask | touched + _selective_rewind_fixed_slots!(pool, combined) + if @inbounds(pool._touched_has_others[d]) + for tp in values(pool.others) + _rewind_typed_pool!(tp, d) + end + end + pop!(pool._touched_type_masks) + pop!(pool._touched_has_others) + pool._current_depth -= 1 + nothing +end + +""" + _selective_rewind_fixed_slots!(pool::AdaptiveArrayPool, mask::UInt16) + +Rewind only the fixed-slot typed pools whose bits are set in `mask`. + +Each of the 8 fixed-slot pools maps to bits 0–7 (same encoding as `_fixed_slot_bit`). +Bits 8–15 (mode flags) are **not** checked here — callers must strip them +before passing the mask (e.g. `mask & _TYPE_BITS_MASK`). 
+""" +@inline function _selective_rewind_fixed_slots!(pool::AdaptiveArrayPool, mask::UInt16) + + d = pool._current_depth + _has_bit(mask, Float64) && _rewind_typed_pool!(pool.float64, d) + _has_bit(mask, Float32) && _rewind_typed_pool!(pool.float32, d) + _has_bit(mask, Int64) && _rewind_typed_pool!(pool.int64, d) + _has_bit(mask, Int32) && _rewind_typed_pool!(pool.int32, d) + _has_bit(mask, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d) + _has_bit(mask, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d) + _has_bit(mask, Bool) && _rewind_typed_pool!(pool.bool, d) + _has_bit(mask, Bit) && _rewind_typed_pool!(pool.bits, d) + nothing +end + +# ============================================================================== +# State Management - empty! (Legacy: N-way cache fields) +# ============================================================================== + +""" + empty!(tp::BitTypedPool) + +Clear all internal storage for BitTypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. +""" +function Base.empty!(tp::BitTypedPool) + empty!(tp.vectors) + # Clear N-way cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(tp::TypedPool) + +Clear all internal storage for TypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. 
+""" +function Base.empty!(tp::TypedPool) + empty!(tp.vectors) + empty!(tp.views) + empty!(tp.view_lengths) + # Clear N-way cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(pool::AdaptiveArrayPool) + +Completely clear the pool, releasing all stored vectors and resetting all state. + +This is useful when you want to free memory or start fresh without creating +a new pool instance. + +## Example +```julia +pool = AdaptiveArrayPool() +v = acquire!(pool, Float64, 1000) +# ... use v ... +empty!(pool) # Release all memory +``` + +## Warning +Any SubArrays previously acquired from this pool become invalid after `empty!`. +""" +function Base.empty!(pool::AdaptiveArrayPool) + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + empty!(tp) + end + + # Others - clear all TypedPools then the IdDict itself + for tp in values(pool.others) + empty!(tp) + end + empty!(pool.others) + + # Reset type touch tracking state (1-based sentinel pattern) + pool._current_depth = 1 # 1 = global scope (sentinel) + empty!(pool._touched_type_masks) + push!(pool._touched_type_masks, UInt16(0)) # Sentinel: no bits set + empty!(pool._touched_has_others) + push!(pool._touched_has_others, false) # Sentinel: no others + + return pool +end + +# ============================================================================== +# State Management - reset! +# ============================================================================== + +""" + reset!(tp::AbstractTypedPool) + +Reset state without clearing allocated storage. +Sets `n_active = 0` and restores checkpoint stacks to sentinel state. 
+""" +function reset!(tp::AbstractTypedPool) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + reset!(pool::AdaptiveArrayPool) + +Reset pool state without clearing allocated storage. + +This function: +- Resets all `n_active` counters to 0 +- Restores all checkpoint stacks to sentinel state +- Resets `_current_depth` and type touch tracking state + +Unlike `empty!`, this **preserves** all allocated vectors, views, and N-D arrays +for reuse, avoiding reallocation costs. + +## Use Case +When functions that acquire from the pool are called without proper +`checkpoint!/rewind!` management, `n_active` can grow indefinitely. +Use `reset!` to cleanly restore the pool to its initial state while +keeping allocated memory available. + +## Example +```julia +pool = AdaptiveArrayPool() + +# Some function that acquires without checkpoint management +function compute!(pool) + v = acquire!(pool, Float64, 100) + # ... use v ... + # No rewind! 
called +end + +for _ in 1:1000 + compute!(pool) # n_active grows each iteration +end + +reset!(pool) # Restore state, keep allocated memory +# Now pool.n_active == 0, but vectors are still available for reuse +``` + +See also: [`empty!`](@ref), [`rewind!`](@ref) +""" +function reset!(pool::AdaptiveArrayPool) + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + reset!(tp) + end + + # Others - reset all TypedPools + for tp in values(pool.others) + reset!(tp) + end + + # Reset type touch tracking state (1-based sentinel pattern) + pool._current_depth = 1 # 1 = global scope (sentinel) + empty!(pool._touched_type_masks) + push!(pool._touched_type_masks, UInt16(0)) # Sentinel: no bits set + empty!(pool._touched_has_others) + push!(pool._touched_has_others, false) # Sentinel: no others + + return pool +end + +""" + reset!(pool::AdaptiveArrayPool, ::Type{T}) + +Reset state for a specific type only. Clears n_active and checkpoint stacks +to sentinel state while preserving allocated vectors. + +See also: [`reset!(::AdaptiveArrayPool)`](@ref), [`rewind!`](@ref) +""" +@inline function reset!(pool::AdaptiveArrayPool, ::Type{T}) where T + reset!(get_typed_pool!(pool, T)) + pool +end + +""" + reset!(pool::AdaptiveArrayPool, types::Type...) + +Reset state for multiple specific types. Uses @generated for zero-overhead +compile-time unrolling. + +See also: [`reset!(::AdaptiveArrayPool)`](@ref), [`rewind!`](@ref) +""" +@generated function reset!(pool::AdaptiveArrayPool, types::Type...) + reset_exprs = [:(reset!(get_typed_pool!(pool, types[$i]))) for i in 1:length(types)] + quote + $(reset_exprs...) + pool + end +end + +# ============================================================================== +# Bitmask Helpers for Typed Path Decisions +# ============================================================================== + +""" + _tracked_mask_for_types(types::Type...) 
-> UInt16 + +Compute compile-time bitmask for the types tracked by a typed checkpoint/rewind. +Uses `@generated` for zero-overhead constant folding. + +Returns `UInt16(0)` when called with no arguments. +Non-fixed-slot types contribute `UInt16(0)` (their bit is 0). +""" +@generated function _tracked_mask_for_types(types::Type...) + mask = UInt16(0) + for i in 1:length(types) + T = types[i].parameters[1] + mask |= _fixed_slot_bit(T) + end + return :(UInt16($mask)) +end + +""" + _can_use_typed_path(pool::AbstractArrayPool, tracked_mask::UInt16) -> Bool + +Check if the typed (fast) checkpoint/rewind path is safe to use. + +Returns `true` when all touched types at the current depth are a subset +of the tracked types (bitmask subset check) AND no non-fixed-slot types were touched. + +The subset check: `(touched_mask & ~tracked_mask) == 0` means every bit set +in `touched_mask` is also set in `tracked_mask`. +""" +@inline function _can_use_typed_path(pool::AbstractArrayPool, tracked_mask::UInt16) + depth = pool._current_depth + touched_mask = @inbounds(pool._touched_type_masks[depth]) & _TYPE_BITS_MASK + has_others = @inbounds pool._touched_has_others[depth] + return (touched_mask & ~tracked_mask) == UInt16(0) && !has_others +end + +# ============================================================================== +# DisabledPool State Management (no-ops) +# ============================================================================== +# DisabledPool doesn't track state, so all operations are no-ops. + +checkpoint!(::DisabledPool) = nothing +checkpoint!(::DisabledPool, ::Type) = nothing +checkpoint!(::DisabledPool, types::Type...) = nothing + +rewind!(::DisabledPool) = nothing +rewind!(::DisabledPool, ::Type) = nothing +rewind!(::DisabledPool, types::Type...) = nothing + +reset!(::DisabledPool) = nothing +reset!(::DisabledPool, ::Type) = nothing +reset!(::DisabledPool, types::Type...) 
= nothing + +Base.empty!(::DisabledPool) = nothing diff --git a/src/legacy/types.jl b/src/legacy/types.jl new file mode 100644 index 00000000..b056ddbb --- /dev/null +++ b/src/legacy/types.jl @@ -0,0 +1,447 @@ +# ============================================================================== +# Constants (Configurable via Preferences) — Legacy (Julia ≤1.10) +# ============================================================================== + +using Preferences + +""" +Number of cache ways per slot for N-way set associative cache. +Supports up to `CACHE_WAYS` different dimension patterns per slot without thrashing. + +Default: 4 (handles most use cases well) + +## Configuration +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.set_cache_ways!(8) # Restart Julia to take effect +``` + +Or manually in `LocalPreferences.toml`: +```toml +[AdaptiveArrayPools] +cache_ways = 8 +``` + +Valid range: 1-16 (higher values increase memory but reduce eviction) +""" +const CACHE_WAYS = let + ways = @load_preference("cache_ways", 4)::Int + if ways < 1 || ways > 16 + @warn "CACHE_WAYS=$ways out of range [1,16], using default 4" + 4 + else + ways + end +end + +""" + set_cache_ways!(n::Int) + +Set the number of cache ways for N-D array caching. +**Requires Julia restart to take effect.** + +Higher values reduce cache eviction but increase memory usage per slot. + +## Arguments +- `n::Int`: Number of cache ways (valid range: 1-16) + +## Example +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.set_cache_ways!(8) # Double the default +# Restart Julia to apply the change +``` +""" +function set_cache_ways!(n::Int) + if n < 1 || n > 16 + throw(ArgumentError("cache_ways must be in range [1, 16], got $n")) + end + @set_preferences!("cache_ways" => n) + @info "CACHE_WAYS set to $n. Restart Julia to apply." 
+ return n +end + +# ============================================================================== +# Abstract Type Hierarchy (for extensibility) +# ============================================================================== + +""" + AbstractTypedPool{T, V<:AbstractVector{T}} + +Abstract base for type-specific memory pools. +""" +abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end + +""" + AbstractArrayPool + +Abstract base for multi-type array pools. +""" +abstract type AbstractArrayPool end + +# ============================================================================== +# Disabled Pool Sentinel Types +# ============================================================================== + +""" + DisabledPool{Backend} + +Sentinel type for disabled pooling that preserves backend context. +When `USE_POOLING=false` (compile-time) or `MAYBE_POOLING_ENABLED[]=false` (runtime), +macros return `DisabledPool{backend}()` instead of `nothing`. + +Backend symbols: +- `:cpu` - Standard Julia arrays +- `:cuda` - CUDA.jl CuArrays (defined in extension) + +This enables `@with_pool :cuda` to return correct array types even when pooling is off. + +## Example +```julia +# When USE_POOLING=false: +@with_pool :cuda pool begin + v = zeros!(pool, 10) # Returns CuArray{Float32}, not Array{Float64}! +end +``` + +See also: [`pooling_enabled`](@ref), [`DISABLED_CPU`](@ref) +""" +struct DisabledPool{Backend} end + +""" + DISABLED_CPU + +Singleton instance for disabled CPU pooling. +Used by macros when `USE_POOLING=false` without backend specification. +""" +const DISABLED_CPU = DisabledPool{:cpu}() + +""" + pooling_enabled(pool) -> Bool + +Returns `true` if `pool` is an active pool, `false` if pooling is disabled. 
+ +## Examples +```julia +@maybe_with_pool pool begin + if pooling_enabled(pool) + # Using pooled memory + else + # Using standard allocation + end +end +``` + +See also: [`DisabledPool`](@ref) +""" +pooling_enabled(::AbstractArrayPool) = true +pooling_enabled(::DisabledPool) = false + +# ============================================================================== +# Core Data Structures — Legacy (Julia ≤1.10, N-way set-associative cache) +# ============================================================================== + +# 1-Based Sentinel Pattern: Arrays start with sentinel values to eliminate +# isempty() checks in hot paths. See docstrings for details. + +""" + TypedPool{T} <: AbstractTypedPool{T, Vector{T}} + +Internal structure managing pooled vectors for a specific element type `T`. + +## Fields + +### Storage +- `vectors`: Backing `Vector{T}` storage (actual memory allocation) + +### 1D Cache (for `acquire!(pool, T, n)`) +- `views`: Cached `SubArray` views for zero-allocation 1D access +- `view_lengths`: Cached lengths for fast Int comparison (SoA pattern) + +### N-D Array Cache (N-way set-associative, Julia ≤1.10) +- `nd_arrays`: Cached N-D `Array` objects (length = slots × CACHE_WAYS) +- `nd_dims`: Cached dimension tuples for cache hit validation +- `nd_ptrs`: Cached pointer values to detect backing vector resize +- `nd_next_way`: Round-robin counter per slot (length = slots) + +### State Management (1-based sentinel pattern) +- `n_active`: Count of currently active (checked-out) arrays +- `_checkpoint_n_active`: Saved n_active values at each checkpoint (sentinel: `[0]`) +- `_checkpoint_depths`: Depth of each checkpoint entry (sentinel: `[0]`) +""" +mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} + # --- Storage --- + vectors::Vector{Vector{T}} + + # --- 1D Cache (1:1 mapping) --- + views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}} + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way set associative) --- + 
nd_arrays::Vector{Any} # length = slots × CACHE_WAYS + nd_dims::Vector{Any} # dimension tuples + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot + + # --- State Management (1-based sentinel pattern) --- + n_active::Int + _checkpoint_n_active::Vector{Int} # Saved n_active at each checkpoint + _checkpoint_depths::Vector{Int} # Depth of each checkpoint +end + +TypedPool{T}() where {T} = TypedPool{T}( + # Storage + Vector{T}[], + # 1D Cache + SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}[], + Int[], + # N-D Array Cache + Any[], Any[], UInt[], Int[], + # State Management (1-based sentinel pattern: guaranteed non-empty) + 0, # n_active + [0], # _checkpoint_n_active: sentinel (n_active=0 at depth=0) + [0] # _checkpoint_depths: sentinel (depth=0 = no checkpoint) +) + +# ============================================================================== +# Bit Sentinel Type +# ============================================================================== + +""" + Bit + +Sentinel type for bit-packed boolean storage via `BitVector`. + +Use `Bit` instead of `Bool` in pool operations to get memory-efficient +bit-packed arrays (1 bit per element vs 1 byte for `Vector{Bool}`). + +## Usage +```julia +@with_pool pool begin + # BitVector (1 bit per element, ~8x memory savings) + bv = acquire!(pool, Bit, 1000) + + # vs Vector{Bool} (1 byte per element) + vb = acquire!(pool, Bool, 1000) + + # Convenience functions work too + mask = falses!(pool, 100) # BitVector filled with false + flags = trues!(pool, 100) # BitVector filled with true +end +``` + +## Return Types (Unified for Performance) +Unlike other types, `Bit` always returns native `BitVector`/`BitArray`: +- **1D**: `BitVector` (both `acquire!` and `unsafe_acquire!`) +- **N-D**: `BitArray{N}` (reshaped, preserves SIMD optimization) + +This design ensures users always get SIMD-optimized performance without +needing to remember which API to use. 
+
+## Performance
+`BitVector` operations like `count()`, `sum()`, and bitwise operations are
+~10x–100x faster than equivalent operations on `SubArray{Bool}` because they
+use SIMD-optimized algorithms on packed 64-bit chunks.
+
+```julia
+@with_pool pool begin
+    bv = acquire!(pool, Bit, 10000)
+    fill!(bv, true)
+    count(bv) # Uses fast SIMD path automatically
+end
+```
+
+## Memory Safety
+The returned `BitVector` shares its internal `chunks` array with the pool.
+It is only valid within the `@with_pool` scope - using it after the scope
+ends leads to undefined behavior (use-after-free risk).
+
+See also: [`trues!`](@ref), [`falses!`](@ref), [`BitTypedPool`](@ref)
+"""
+struct Bit end
+
+# ==============================================================================
+# BitTypedPool — Legacy (Julia ≤1.10, N-way set-associative cache)
+# ==============================================================================
+
+"""
+    BitTypedPool <: AbstractTypedPool{Bool, BitVector}
+
+Specialized pool for `BitVector` arrays with memory reuse.
+ +## Fields +- `vectors`: Backing `BitVector` storage +- `nd_arrays`, `nd_dims`, `nd_ptrs`, `nd_next_way`: N-way cache (Julia ≤1.10) +- `n_active`: Count of currently active arrays +- `_checkpoint_*`: State management stacks (1-based sentinel pattern) + +See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) +""" +mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} + # --- Storage --- + vectors::Vector{BitVector} + + # --- N-D BitArray Cache (N-way set associative) --- + nd_arrays::Vector{Any} # Cached BitArray{N} instances + nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot + + # --- State Management (1-based sentinel pattern) --- + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +BitTypedPool() = BitTypedPool( + # Storage + BitVector[], + # N-D BitArray Cache + Any[], Any[], UInt[], Int[], + # State Management (1-based sentinel pattern) + 0, # n_active + [0], # _checkpoint_n_active: sentinel + [0] # _checkpoint_depths: sentinel +) + +# ============================================================================== +# Fixed Slot Configuration +# ============================================================================== + +""" + FIXED_SLOT_FIELDS + +Field names for fixed slot TypedPools. Single source of truth for `foreach_fixed_slot`. + +When modifying, also update: struct definition, `get_typed_pool!` dispatches, constructor. +Tests verify synchronization automatically. 
+""" +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool, :bits) + +# ============================================================================== +# Bitmask Mode Constants +# ============================================================================== +# Bits 0-7: fixed-slot type touch tracking (one bit per type) +# Bits 14-15: mode flags set during checkpoint to control lazy behavior + +const _LAZY_MODE_BIT = UInt16(0x8000) # bit 15: lazy (dynamic-selective) checkpoint mode +const _TYPED_LAZY_BIT = UInt16(0x4000) # bit 14: typed lazy-fallback mode +const _MODE_BITS_MASK = UInt16(0xC000) # bits 14-15: all mode flags +const _TYPE_BITS_MASK = UInt16(0x00FF) # bits 0-7: fixed-slot type bits + +# ============================================================================== +# Fixed-Slot Bit Mapping (for type touch tracking) +# ============================================================================== +# Maps each fixed-slot type to a unique bit in a UInt16 bitmask. +# Bit ordering matches FIXED_SLOT_FIELDS. Non-fixed types return UInt16(0). + +@inline _fixed_slot_bit(::Type{Float64}) = UInt16(1) << 0 +@inline _fixed_slot_bit(::Type{Float32}) = UInt16(1) << 1 +@inline _fixed_slot_bit(::Type{Int64}) = UInt16(1) << 2 +@inline _fixed_slot_bit(::Type{Int32}) = UInt16(1) << 3 +@inline _fixed_slot_bit(::Type{ComplexF64}) = UInt16(1) << 4 +@inline _fixed_slot_bit(::Type{ComplexF32}) = UInt16(1) << 5 +@inline _fixed_slot_bit(::Type{Bool}) = UInt16(1) << 6 +@inline _fixed_slot_bit(::Type{Bit}) = UInt16(1) << 7 +@inline _fixed_slot_bit(::Type) = UInt16(0) # non-fixed-slot → triggers has_others + +# Check whether a type's bit is set in a bitmask (e.g. _touched_type_masks or combined). 
+@inline _has_bit(mask::UInt16, ::Type{T}) where {T} = (mask & _fixed_slot_bit(T)) != 0 + +# ============================================================================== +# AdaptiveArrayPool +# ============================================================================== + +""" + AdaptiveArrayPool + +Multi-type memory pool with fixed slots for common types and IdDict fallback for others. +Zero allocation after warmup. NOT thread-safe - use one pool per Task. +""" +mutable struct AdaptiveArrayPool <: AbstractArrayPool + # Fixed Slots: common types with zero lookup overhead + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + bits::BitTypedPool # BitVector pool (1 bit per element) + + # Fallback: rare types + others::IdDict{DataType, Any} + + # Type touch tracking (1-based sentinel pattern) + _current_depth::Int # Current scope depth (1 = global scope) + _touched_type_masks::Vector{UInt16} # Per-depth: which fixed slots were touched + mode flags + _touched_has_others::Vector{Bool} # Per-depth: any non-fixed-slot type touched? 
+end + +function AdaptiveArrayPool() + AdaptiveArrayPool( + TypedPool{Float64}(), + TypedPool{Float32}(), + TypedPool{Int64}(), + TypedPool{Int32}(), + TypedPool{ComplexF64}(), + TypedPool{ComplexF32}(), + TypedPool{Bool}(), + BitTypedPool(), + IdDict{DataType, Any}(), + 1, # _current_depth: 1 = global scope (sentinel) + [UInt16(0)], # _touched_type_masks: sentinel (no bits set) + [false] # _touched_has_others: sentinel (no others) + ) +end + +# ============================================================================== +# Type Dispatch (Zero-cost for Fixed Slots) +# ============================================================================== + +# Fast Path: compile-time dispatch, fully inlined +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Float64}) = p.float64 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Float32}) = p.float32 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Int64}) = p.int64 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Int32}) = p.int32 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{ComplexF64}) = p.complexf64 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{ComplexF32}) = p.complexf32 +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Bool}) = p.bool +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{Bit}) = p.bits + +# Slow Path: rare types via IdDict +@inline function get_typed_pool!(p::AdaptiveArrayPool, ::Type{T}) where {T} + get!(p.others, T) do + tp = TypedPool{T}() + # If inside a checkpoint scope (_current_depth > 1 means inside @with_pool), + # auto-checkpoint the new pool to prevent issues on rewind + if p._current_depth > 1 + push!(tp._checkpoint_n_active, 0) # n_active starts at 0 + push!(tp._checkpoint_depths, p._current_depth) + # Signal that a fallback type was touched so lazy/typed-lazy rewind + # iterates pool.others. Without this, _acquire_impl! (which bypasses + # _record_type_touch!) 
would leave has_others=false, causing the + # rewind to skip pool.others entirely and leak this new type's n_active. + @inbounds p._touched_has_others[p._current_depth] = true + end + tp + end::TypedPool{T} +end + +# ============================================================================== +# Zero-Allocation Iteration +# ============================================================================== + +""" + foreach_fixed_slot(f, pool::AdaptiveArrayPool) + +Apply `f` to each fixed slot TypedPool. Zero allocation via compile-time unrolling. +""" +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + Base.@_inline_meta + $(exprs...) + nothing + end +end diff --git a/src/state.jl b/src/state.jl index 79a20a1c..b591b520 100644 --- a/src/state.jl +++ b/src/state.jl @@ -388,11 +388,7 @@ Restores sentinel values for 1-based sentinel pattern. """ function Base.empty!(tp::BitTypedPool) empty!(tp.vectors) - # Clear N-way wrapper cache - empty!(tp.nd_arrays) - empty!(tp.nd_dims) - empty!(tp.nd_ptrs) - empty!(tp.nd_next_way) + empty!(tp.nd_wrappers) tp.n_active = 0 # Restore sentinel values (1-based sentinel pattern) empty!(tp._checkpoint_n_active) @@ -412,11 +408,7 @@ function Base.empty!(tp::TypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_lengths) - # Clear N-D Array cache (N-way) - empty!(tp.nd_arrays) - empty!(tp.nd_dims) - empty!(tp.nd_ptrs) - empty!(tp.nd_next_way) + empty!(tp.nd_wrappers) tp.n_active = 0 # Restore sentinel values (1-based sentinel pattern) empty!(tp._checkpoint_n_active) diff --git a/src/types.jl b/src/types.jl index 8ccd2c41..c2a01c9b 100644 --- a/src/types.jl +++ b/src/types.jl @@ -158,11 +158,10 @@ Internal structure managing pooled vectors for a specific element type `T`. 
- `views`: Cached `SubArray` views for zero-allocation 1D access - `view_lengths`: Cached lengths for fast Int comparison (SoA pattern) -### N-D Array Cache (for `unsafe_acquire!` only, N-way set associative) -- `nd_arrays`: Cached N-D `Array` objects (length = slots × CACHE_WAYS) -- `nd_dims`: Cached dimension tuples for cache hit validation -- `nd_ptrs`: Cached pointer values to detect backing vector resize -- `nd_next_way`: Round-robin counter per slot (length = slots) +### N-D Wrapper Cache (Julia 1.11+, setfield!-based reuse) +- `nd_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — indexed by N (dimensionality), + each entry is a `Vector{Any}` holding one cached `Array{T,N}` wrapper per slot. Uses `setfield!(wrapper, :size, dims)` + and `setfield!(wrapper, :ref, parent)` for zero-allocation reuse of unlimited dim patterns. ### State Management (1-based sentinel pattern) - `n_active`: Count of currently active (checked-out) arrays @@ -171,7 +170,7 @@ Internal structure managing pooled vectors for a specific element type `T`. ## Note `acquire!` for N-D returns `ReshapedArray` (zero creation cost), so no caching needed. -Only `unsafe_acquire!` benefits from N-D caching since `unsafe_wrap` allocates 112 bytes. +`unsafe_acquire!` uses `setfield!` wrapper reuse — unlimited dim patterns, 0-alloc after warmup.
""" mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} # --- Storage --- @@ -181,11 +180,8 @@ mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}} view_lengths::Vector{Int} - # --- N-D Array Cache (N-way set associative) --- - nd_arrays::Vector{Any} # length = slots × CACHE_WAYS - nd_dims::Vector{Any} # dimension tuples - nd_ptrs::Vector{UInt} # pointer validation - nd_next_way::Vector{Int} # round-robin counter per slot + # --- N-D Wrapper Cache (setfield!-based reuse) --- + nd_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot Array{T,N} # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -199,11 +195,8 @@ TypedPool{T}() where {T} = TypedPool{T}( # 1D Cache SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int64}}, true}[], Int[], - # N-D Array Cache (N-way) - Any[], - Any[], - UInt[], - Int[], + # N-D Wrapper Cache + Union{Nothing, Vector{Any}}[], # State Management (1-based sentinel pattern: guaranteed non-empty) 0, # n_active [0], # _checkpoint_n_active: sentinel (n_active=0 at depth=0) @@ -291,26 +284,10 @@ performance without needing to choose between APIs. 
## Fields - `vectors`: Backing `BitVector` storage -- `nd_arrays`: Cached wrapper BitVectors (chunks sharing) -- `nd_dims`: Cached lengths for wrapper cache validation -- `nd_ptrs`: Cached chunk pointers for invalidation detection -- `nd_next_way`: Round-robin counter for N-way cache +- `nd_wrappers`: `Vector{Union{Nothing, Vector{Any}}}` — setfield!-based cache (Julia 1.11+) - `n_active`: Count of currently active arrays - `_checkpoint_*`: State management stacks (1-based sentinel pattern) -## Usage -```julia -@with_pool pool begin - # All return BitVector with SIMD performance - bv = acquire!(pool, Bit, 100) # BitVector - count(bv) # Fast SIMD path - - # Convenience functions - t = trues!(pool, 50) # BitVector filled with true - f = falses!(pool, 50) # BitVector filled with false -end -``` - ## Performance Operations like `count()`, `sum()`, and bitwise operations are ~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because `BitVector` uses @@ -322,15 +299,8 @@ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- N-D BitArray Cache (N-way set associative) --- - # Unlike TypedPool which uses views for 1D and nd_* for N-D, - # BitTypedPool uses nd_* for ALL dimensions (1D, 2D, 3D, etc.). - # No views needed since we always return BitArray{N}, not SubArray. - # BitArray.dims is mutable, enabling 0-alloc reuse for same-ndims requests. 
- nd_arrays::Vector{Any} # Cached BitArray{N} instances - nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) - nd_ptrs::Vector{UInt} # pointer validation - nd_next_way::Vector{Int} # round-robin counter per slot + # --- N-D Wrapper Cache (setfield!-based reuse) --- + nd_wrappers::Vector{Union{Nothing, Vector{Any}}} # index=N (dimensionality), value=per-slot BitArray{N} # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -341,11 +311,8 @@ end BitTypedPool() = BitTypedPool( # Storage BitVector[], - # 1D BitVector Wrapper Cache (N-way) - Any[], - Any[], - UInt[], - Int[], + # N-D Wrapper Cache + Union{Nothing, Vector{Any}}[], # State Management (1-based sentinel pattern) 0, # n_active [0], # _checkpoint_n_active: sentinel diff --git a/test/legacy/test_nway_cache.jl b/test/legacy/test_nway_cache.jl new file mode 100644 index 00000000..1041c6c6 --- /dev/null +++ b/test/legacy/test_nway_cache.jl @@ -0,0 +1,189 @@ +using Test +using AdaptiveArrayPools + +@testset "N-way Cache for unsafe_acquire!" begin + + @testset "Type checks" begin + pool = AdaptiveArrayPool() + + @with_pool pool begin + # acquire! returns ReshapedArray for N-D + arr = acquire!(pool, Float64, 10, 10) + @test arr isa Base.ReshapedArray{Float64, 2} + + # acquire! returns SubArray for 1D + vec = acquire!(pool, Float64, 100) + @test vec isa SubArray{Float64, 1} + + # unsafe_acquire! returns Array + raw = unsafe_acquire!(pool, Float64, 10, 10) + @test raw isa Array{Float64, 2} + @test raw isa Matrix{Float64} + end + end + + @testset "CACHE_WAYS configuration" begin + # Verify CACHE_WAYS is exported and accessible + @test CACHE_WAYS isa Int + @test 1 <= CACHE_WAYS <= 16 # Valid range + + # Verify set_cache_ways! is exported + @test isdefined(AdaptiveArrayPools, :set_cache_ways!) + end + + @testset "set_cache_ways! 
validation" begin + # Valid values should return the input value + @test set_cache_ways!(1) == 1 + @test set_cache_ways!(4) == 4 + @test set_cache_ways!(8) == 8 + @test set_cache_ways!(16) == 16 + + # Invalid values should throw ArgumentError + @test_throws ArgumentError set_cache_ways!(0) + @test_throws ArgumentError set_cache_ways!(-1) + @test_throws ArgumentError set_cache_ways!(17) + @test_throws ArgumentError set_cache_ways!(100) + + # Reset to default after tests + set_cache_ways!(4) + end + +end + +@testset "N-way Zero-Allocation" begin + + @testset "N-D unsafe_acquire!: 4-way alternating is zero-alloc" begin + pool = AdaptiveArrayPool() + + function test_nd_4way!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16)) + for _ in 1:100 + for dims in dims_list + @with_pool p begin + unsafe_acquire!(p, Float64, dims...) + end + end + end + end + + # Warmup + test_nd_4way!(pool) + test_nd_4way!(pool) + + # Measure + allocs = @allocated test_nd_4way!(pool) + allocs > 0 && @warn "N-D 4-way: $allocs bytes (expected 0)" + @test allocs == 0 + end + + @testset "N-D acquire!: 5-way is zero-alloc (ReshapedArray)" begin + # acquire! returns ReshapedArray → no N-way cache needed → always 0 alloc + pool = AdaptiveArrayPool() + + function test_nd_5way_acquire!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16), (4, 12)) + for _ in 1:100 + for dims in dims_list + @with_pool p begin + acquire!(p, Float64, dims...) # ReshapedArray + end + end + end + end + + # Warmup + test_nd_5way_acquire!(pool) + test_nd_5way_acquire!(pool) + + # acquire! uses reshape(1D_view, dims) → 0 alloc regardless of pattern count + allocs = @allocated test_nd_5way_acquire!(pool) + allocs > 0 && @warn "N-D acquire! 
5-way: $allocs bytes (expected 0)" + @test allocs == 0 + end + + @testset "N-D unsafe_acquire!: 5-way behavior" begin + pool = AdaptiveArrayPool() + + function test_nd_5way_unsafe!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16), (4, 12)) + for _ in 1:100 + for dims in dims_list + @with_pool p begin + unsafe_acquire!(p, Float64, dims...) + end + end + end + end + + # Warmup + test_nd_5way_unsafe!(pool) + test_nd_5way_unsafe!(pool) + + allocs = @allocated test_nd_5way_unsafe!(pool) + # N-way eviction: 5 patterns > CACHE_WAYS=4 + @test allocs > 0 + end + + @testset "Cache invalidation on resize" begin + pool = AdaptiveArrayPool() + + # Warmup with small array + @with_pool pool begin + unsafe_acquire!(pool, Float64, 10, 10) + end + + # Request larger array (forces resize, invalidates cache) + @with_pool pool begin + arr = unsafe_acquire!(pool, Float64, 100, 100) + @test size(arr) == (100, 100) + end + + # Warmup again with new size + @with_pool pool begin + unsafe_acquire!(pool, Float64, 100, 100) + end + + # Now should be zero allocation + allocs = @allocated begin + @with_pool pool begin + unsafe_acquire!(pool, Float64, 100, 100) + end + end + @test allocs == 0 + end + + @testset "Multiple slots with N-way cache" begin + pool = AdaptiveArrayPool() + + # Warmup: each slot gets 2 different shapes + for _ in 1:2 + @with_pool pool begin + unsafe_acquire!(pool, Float64, 5, 5) # Slot 1 + unsafe_acquire!(pool, Float64, 10, 10) # Slot 2 + end + @with_pool pool begin + unsafe_acquire!(pool, Float64, 6, 6) # Slot 1, different dims + unsafe_acquire!(pool, Float64, 12, 12) # Slot 2, different dims + end + end + + # Both slots should have their shapes cached + allocs = @allocated begin + @with_pool pool begin + unsafe_acquire!(pool, Float64, 5, 5) + unsafe_acquire!(pool, Float64, 10, 10) + end + end + @test allocs == 0 + + # Alternating dims should also hit cache + allocs = @allocated begin + @with_pool pool begin + unsafe_acquire!(pool, Float64, 6, 6) + 
unsafe_acquire!(pool, Float64, 12, 12) + end + end + @test allocs == 0 + end + +end diff --git a/test/runtests.jl b/test/runtests.jl index f8cc5f2e..a0d0459c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,14 @@ using Test using AdaptiveArrayPools using AdaptiveArrayPools: get_typed_pool! -import AdaptiveArrayPools: checkpoint!, rewind! +import AdaptiveArrayPools: checkpoint!, rewind! + +# Version-specific helpers (always defined, even for ARGS path) +@static if VERSION >= v"1.11-" + _test_nd_cache_preserved(tp) = !isempty(tp.nd_wrappers) +else + _test_nd_cache_preserved(tp) = length(tp.nd_arrays) >= 1 +end # Check if specific test files are requested via ARGS if !isempty(ARGS) @@ -10,26 +17,50 @@ if !isempty(ARGS) include(testfile) end else - include("test_aqua.jl") - include("test_basic.jl") - include("test_state.jl") - include("test_multidimensional.jl") - include("test_macros.jl") - include("test_task_local_pool.jl") - include("test_utils.jl") - include("test_macro_expansion.jl") - include("test_macro_internals.jl") - include("test_zero_allocation.jl") - include("test_disabled_pooling.jl") - include("test_aliases.jl") - include("test_nway_cache.jl") - include("test_fixed_slots.jl") - include("test_backend_macro_expansion.jl") - include("test_convenience.jl") - include("test_bitarray.jl") - include("test_coverage.jl") - include("test_allocation.jl") - include("test_fallback_reclamation.jl") + # Version-specific test file selection + @static if VERSION >= v"1.11-" + include("test_aqua.jl") + include("test_basic.jl") + include("test_state.jl") + include("test_multidimensional.jl") + include("test_macros.jl") + include("test_task_local_pool.jl") + include("test_utils.jl") + include("test_macro_expansion.jl") + include("test_macro_internals.jl") + include("test_zero_allocation.jl") + include("test_disabled_pooling.jl") + include("test_aliases.jl") + include("test_nway_cache.jl") + include("test_fixed_slots.jl") + 
include("test_backend_macro_expansion.jl") + include("test_convenience.jl") + include("test_bitarray.jl") + include("test_coverage.jl") + include("test_allocation.jl") + include("test_fallback_reclamation.jl") + else + include("test_aqua.jl") + include("test_basic.jl") + include("test_state.jl") + include("test_multidimensional.jl") + include("test_macros.jl") + include("test_task_local_pool.jl") + include("test_utils.jl") + include("test_macro_expansion.jl") + include("test_macro_internals.jl") + include("test_zero_allocation.jl") + include("test_disabled_pooling.jl") + include("test_aliases.jl") + include("legacy/test_nway_cache.jl") + include("test_fixed_slots.jl") + include("test_backend_macro_expansion.jl") + include("test_convenience.jl") + include("test_bitarray.jl") + include("test_coverage.jl") + include("test_allocation.jl") + include("test_fallback_reclamation.jl") + end # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" diff --git a/test/test_nway_cache.jl b/test/test_nway_cache.jl index c5051e74..cd1c2c76 100644 --- a/test/test_nway_cache.jl +++ b/test/test_nway_cache.jl @@ -101,8 +101,7 @@ end @test allocs == 0 end - @testset "N-D unsafe_acquire!: 5-way causes allocation (cache eviction)" begin - # unsafe_acquire! uses N-way cache → 5-way exceeds CACHE_WAYS=4 + @testset "N-D unsafe_acquire!: 5-way behavior" begin pool = AdaptiveArrayPool() function test_nd_5way_unsafe!(p) @@ -110,19 +109,45 @@ end for _ in 1:100 for dims in dims_list @with_pool p begin - unsafe_acquire!(p, Float64, dims...) # Array with cache + unsafe_acquire!(p, Float64, dims...) end end end end - # Warmup (fills cache with 4 patterns, 5th evicts one) + # Warmup test_nd_5way_unsafe!(pool) test_nd_5way_unsafe!(pool) - # 5-way exceeds 4-way cache → eviction → unsafe_wrap allocation allocs = @allocated test_nd_5way_unsafe!(pool) - @test allocs > 0 + # setfield! 
reuse: unlimited dim patterns, 0-alloc + allocs > 0 && @warn "N-D 5-way unsafe: $allocs bytes (expected 0)" + @test allocs == 0 + end + + @testset "N-D unsafe_acquire!: 10+ patterns per slot is zero-alloc" begin + # Demonstrates removal of CACHE_WAYS limit via setfield! (Julia 1.11+) + pool = AdaptiveArrayPool() + + function test_nd_many_patterns!(p) + dims_list = ((2, 50), (5, 20), (10, 10), (20, 5), (50, 2), + (1, 100), (100, 1), (4, 25), (25, 4), (8, 13)) + for _ in 1:50 + for dims in dims_list + @with_pool p begin + unsafe_acquire!(p, Float64, dims...) + end + end + end + end + + # Warmup + test_nd_many_patterns!(pool) + test_nd_many_patterns!(pool) + + allocs = @allocated test_nd_many_patterns!(pool) + allocs > 0 && @warn "N-D 10+ patterns: $allocs bytes (expected 0)" + @test allocs == 0 end @testset "Cache invalidation on resize" begin @@ -188,3 +213,179 @@ end end end + +# ============================================================================== +# Vector-Based N-D Wrapper Cache Tests (Julia 1.11+) +# ============================================================================== +# These tests verify the Dict→Vector migration for nd_wrappers. + +@testset "Vector-based nd_wrappers cache" begin + using AdaptiveArrayPools: checkpoint!, rewind! 
+ + @testset "nd_wrappers grows correctly for multiple dimensionalities" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # N=1: 1D unsafe_acquire + v1 = unsafe_acquire!(pool, Float64, 10) + @test length(pool.float64.nd_wrappers) >= 1 + + # N=2: 2D unsafe_acquire — nd_wrappers should grow to index 2 + m1 = unsafe_acquire!(pool, Float64, 3, 4) + @test length(pool.float64.nd_wrappers) >= 2 + @test pool.float64.nd_wrappers[2] !== nothing # has a Vector{Any} for N=2 + + # N=3: 3D unsafe_acquire — nd_wrappers should grow to index 3 + t1 = unsafe_acquire!(pool, Float64, 2, 3, 4) + @test length(pool.float64.nd_wrappers) >= 3 + @test pool.float64.nd_wrappers[3] !== nothing # has a Vector{Any} for N=3 + + rewind!(pool) + end + + @testset "wrapper object identity is preserved on cache hit" begin + pool = AdaptiveArrayPool() + + # First call: cache miss → creates wrapper + checkpoint!(pool) + m1 = unsafe_acquire!(pool, Float64, 3, 4) + wrapper_id = objectid(m1) + rewind!(pool) + + # Second call: cache hit → same wrapper object, updated fields + checkpoint!(pool) + m2 = unsafe_acquire!(pool, Float64, 5, 6) + @test objectid(m2) == wrapper_id # same Array object reused via setfield! 
+ @test size(m2) == (5, 6) # dims updated in-place + rewind!(pool) + + # Third call with same dims: still same wrapper + checkpoint!(pool) + m3 = unsafe_acquire!(pool, Float64, 5, 6) + @test objectid(m3) == wrapper_id + @test size(m3) == (5, 6) + rewind!(pool) + end + + @testset "different N values use independent wrapper slots" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Slot 1 as 2D + m = unsafe_acquire!(pool, Float64, 3, 4) + rewind!(pool) + + checkpoint!(pool) + # Slot 1 as 3D — different N, must create new wrapper + t = unsafe_acquire!(pool, Float64, 2, 3, 4) + @test size(t) == (2, 3, 4) + + # Both N=2 and N=3 entries exist + @test pool.float64.nd_wrappers[2] !== nothing + @test pool.float64.nd_wrappers[3] !== nothing + rewind!(pool) + end + + @testset "nd_wrappers with nothing gaps for skipped N" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Jump directly to N=3 without using N=2 + t = unsafe_acquire!(pool, Float64, 2, 3, 4) + @test length(pool.float64.nd_wrappers) >= 3 + + # N=2 entry should be nothing (never used for N=2) + @test pool.float64.nd_wrappers[2] === nothing + + rewind!(pool) + end + + @testset "BitTypedPool nd_wrappers cache" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # 1D BitArray + bv = acquire!(pool, Bit, 100) + @test length(pool.bits.nd_wrappers) >= 1 + + # 2D BitArray + ba = acquire!(pool, Bit, 10, 10) + @test length(pool.bits.nd_wrappers) >= 2 + @test pool.bits.nd_wrappers[2] !== nothing + + rewind!(pool) + + # Verify wrapper reuse for BitArray + checkpoint!(pool) + bv2 = acquire!(pool, Bit, 50) + ba2 = acquire!(pool, Bit, 5, 20) + @test size(ba2) == (5, 20) + rewind!(pool) + end + + @testset "empty! 
clears nd_wrappers" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + unsafe_acquire!(pool, Float64, 3, 4) + rewind!(pool) + + @test !isempty(pool.float64.nd_wrappers) + empty!(pool) + @test isempty(pool.float64.nd_wrappers) + end + + @testset "multiple element types have independent nd_wrappers" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + mf = unsafe_acquire!(pool, Float64, 3, 4) + mi = unsafe_acquire!(pool, Int64, 5, 6) + + @test pool.float64.nd_wrappers[2] !== nothing + @test pool.int64.nd_wrappers[2] !== nothing + + # They must be separate Vector{Any} instances + @test pool.float64.nd_wrappers[2] !== pool.int64.nd_wrappers[2] + + rewind!(pool) + end + + # Function barrier for accurate allocation measurement + function test_mixed_nd_zero_alloc() + pool = AdaptiveArrayPool() + + # Warmup: exercise N=1, N=2, N=3 for same slot + for _ in 1:2 + @with_pool pool begin + unsafe_acquire!(pool, Float64, 100) + end + @with_pool pool begin + unsafe_acquire!(pool, Float64, 10, 10) + end + @with_pool pool begin + unsafe_acquire!(pool, Float64, 5, 4, 5) + end + end + + # Measure: all three should be cache hits + a1 = @allocated @with_pool pool begin + unsafe_acquire!(pool, Float64, 50) + end + a2 = @allocated @with_pool pool begin + unsafe_acquire!(pool, Float64, 7, 7) + end + a3 = @allocated @with_pool pool begin + unsafe_acquire!(pool, Float64, 3, 3, 3) + end + return (a1, a2, a3) + end + + @testset "mixed dimensionalities zero-alloc after warmup" begin + test_mixed_nd_zero_alloc() + test_mixed_nd_zero_alloc() + a1, a2, a3 = test_mixed_nd_zero_alloc() + @test a1 == 0 + @test a2 == 0 + @test a3 == 0 + end +end diff --git a/test/test_state.jl b/test/test_state.jl index a0f67b31..4254c87d 100644 --- a/test/test_state.jl +++ b/test/test_state.jl @@ -287,7 +287,7 @@ import AdaptiveArrayPools: _typed_lazy_checkpoint!, _typed_lazy_rewind!, _tracke @test pool.float64.n_active == 0 @test length(pool.float64.vectors) >= 3 # Vectors preserved @test 
length(pool.float64.views) >= 1 # 1D cache preserved - @test length(pool.float64.nd_arrays) >= 1 # N-D cache preserved + @test _test_nd_cache_preserved(pool.float64) # N-D cache preserved end @testset "reset! restores checkpoint stacks to sentinel" begin