Merged
3 changes: 3 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -9,12 +9,15 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extensions]
AdaptiveArrayPoolsCUDAExt = "CUDA"
AdaptiveArrayPoolsMetalExt = "Metal"

[compat]
CUDA = "5"
Metal = "1"
Preferences = "1"
Printf = "1"
julia = "1.10"
14 changes: 8 additions & 6 deletions README.md
@@ -14,6 +14,7 @@ A lightweight library that lets you write natural, allocation-style code while a
**Supported backends:**
- **CPU** — `Array`, works out of the box
- **CUDA** — `CuArray`, loads automatically when [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is available
- **Metal** — `MtlArray`, loads automatically when [Metal.jl](https://github.com/JuliaGPU/Metal.jl) is available (Apple Silicon)

## The Problem

@@ -68,7 +69,7 @@ end
| Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated |
| GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses |

> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda).
> **GPU support**: Same API—just use `@with_pool :cuda pool` or `@with_pool :metal pool`. See [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/features/cuda-support) and [Metal Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/features/metal-support).

## How It Works

@@ -111,11 +112,12 @@ Pkg.add("AdaptiveArrayPools")

| Guide | Description |
|-------|-------------|
| [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api) | Complete function and macro reference |
| [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda) | GPU-specific usage and examples |
| [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety) | Scope rules and best practices |
| [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) | Task/thread safety patterns |
| [Configuration](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/configuration) | Preferences and cache tuning |
| [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/reference/api) | Complete function and macro reference |
| [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/features/cuda-support) | NVIDIA GPU usage and examples |
| [Metal Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/features/metal-support) | Apple Silicon GPU usage and examples |
| [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/basics/safety-rules) | Scope rules and best practices |
| [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/features/multi-threading) | Task/thread safety patterns |
| [Configuration](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/features/configuration) | Preferences and cache tuning |

## License

2 changes: 2 additions & 0 deletions docs/make.jl
@@ -33,6 +33,7 @@ const README_PATH_MAPPINGS = [
(r"\(docs/configuration\.md(#[^)]+)?\)", s"(features/configuration.md\1)"),
(r"\(docs/maybe_with_pool\.md(#[^)]+)?\)", s"(features/maybe-with-pool.md\1)"),
(r"\(docs/multi-threading\.md(#[^)]+)?\)", s"(features/multi-threading.md\1)"),
(r"\(docs/metal\.md(#[^)]+)?\)", s"(features/metal-support.md\1)"),

# Basics
(r"\(docs/safety\.md(#[^)]+)?\)", s"(basics/safety-rules.md\1)"),
@@ -130,6 +131,7 @@ makedocs(
"`@maybe_with_pool`" => "features/maybe-with-pool.md",
"Bit Arrays" => "features/bit-arrays.md",
"CUDA Support" => "features/cuda-support.md",
"Metal Support" => "features/metal-support.md",
"Configuration" => "features/configuration.md",
],
"Reference" => [
23 changes: 5 additions & 18 deletions docs/src/features/cuda-support.md
@@ -48,24 +48,10 @@ The CUDA backend uses the same API as CPU, with `:cuda` backend specifier:
**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused.

**CPU-side Wrapper Memory** (for `acquire!` N-D on CUDA):
- The CUDA backend uses an N-way set-associative cache for `CuArray` wrapper reuse
- Cache hit (≤`CACHE_WAYS` dimension patterns per slot): 0 bytes
- Cache miss (>`CACHE_WAYS` patterns): ~100 bytes for wrapper metadata
- See [Configuration](configuration.md) for `CACHE_WAYS` tuning

!!! note "CPU vs CUDA caching"
On CPU (Julia 1.11+), `acquire!` uses `setfield!`-based wrapper reuse with **zero allocation for any number of dimension patterns**. The CUDA backend does not yet support this optimization and still uses the N-way cache.

```julia
# Example: 4 patterns fit in default 4-way cache → zero CPU-side allocation
dims_list = ((10, 10), (5, 20), (20, 5), (4, 25))
for dims in dims_list
@with_pool :cuda p begin
A = acquire!(p, Float64, dims...)
# Use A...
end
end
```
- The CUDA backend uses `arr_wrappers`-based direct-index caching for `CuArray` wrapper reuse
- Each dimensionality `N` has one cached wrapper per slot, reused via `setfield!(:dims)`
- After warmup: **zero CPU-side allocation for any number of dimension patterns** (same `N`)
- Different `N` values each get their own cached wrapper (also zero-alloc after first use)
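The caching behavior described above can be sketched as follows. This is an illustrative example built from the API shown in this document (`@with_pool`, `acquire!`); the helper name `sum_many_shapes` is hypothetical.

```julia
using AdaptiveArrayPools, CUDA

# Any number of distinct 2-D shapes reuses the single cached 2-D wrapper
# per slot (via setfield!(:dims)), so after warmup the CPU side allocates nothing.
function sum_many_shapes(dims_list)
    s = 0.0f0
    for dims in dims_list
        @with_pool :cuda p begin
            A = acquire!(p, Float32, dims...)  # CuArray{Float32,2}, wrapper reused
            fill!(A, 1.0f0)
            s += sum(A)
        end
    end
    return s
end

# Six distinct patterns, all N = 2: still zero CPU-side allocation after warmup
dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50), (50, 2))
sum_many_shapes(dims_list)  # warmup
sum_many_shapes(dims_list)  # reuses cached wrappers from here on
```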

## Fixed Slot Types

@@ -86,6 +72,7 @@ Other types use the fallback dictionary (`.others`).

## Limitations

- **Julia 1.11+**: Required for `setfield!`-based Array internals used by GPU extensions
- **No `@maybe_with_pool :cuda`**: Runtime toggle not supported for CUDA backend
- **Task-local only**: Each Task gets its own CUDA pool, same as CPU
- **Same device**: All arrays in a pool use the same CUDA device
129 changes: 129 additions & 0 deletions docs/src/features/metal-support.md
@@ -0,0 +1,129 @@
# Metal Backend

AdaptiveArrayPools provides native Apple Silicon GPU support through a package extension that loads automatically when [Metal.jl](https://github.com/JuliaGPU/Metal.jl) is available. Requires Julia 1.11+.

## Quick Start

```julia
using AdaptiveArrayPools, Metal

# Use :metal backend for Apple Silicon GPU arrays
@with_pool :metal pool function gpu_computation(n)
A = acquire!(pool, Float32, n, n) # MtlArray
B = acquire!(pool, Float32, n, n) # MtlArray

fill!(A, 1.0f0)
fill!(B, 2.0f0)

return sum(A .+ B)
end

# Zero GPU allocation in hot loops
for i in 1:1000
gpu_computation(100) # GPU memory reused from pool
end
```

## API

The Metal backend uses the same API as CPU and CUDA, with `:metal` backend specifier:

| Macro/Function | Description |
|----------------|-------------|
| `@with_pool :metal pool expr` | GPU pool with automatic checkpoint/rewind |
| `acquire!(pool, T, dims...)` | Returns `MtlArray` (always 0 bytes GPU alloc) |
| `acquire_view!(pool, T, dims...)` | Returns `MtlArray` (same as `acquire!` on Metal) |
| `get_task_local_metal_pool()` | Returns the task-local Metal pool |
| `pool_stats(:metal)` | Print Metal pool statistics |

## Return Types

| Function | 1D Return | N-D Return |
|----------|-----------|------------|
| `acquire!` | `MtlArray{T,1}` | `MtlArray{T,N}` |
| `acquire_view!` | `MtlArray{T,1}` | `MtlArray{T,N}` |

## Allocation Behavior

**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `MtlVector` is resized as needed and reused.

**CPU-side Wrapper Memory** (for `acquire!` N-D on Metal):
- The Metal backend uses `arr_wrappers`-based direct-index caching for `MtlArray` wrapper reuse
- Each dimensionality `N` has one cached wrapper per slot, reused via `setfield!(:dims)`
- After warmup: **zero CPU-side allocation for any number of dimension patterns** (same `N`)
- Different `N` values each get their own cached wrapper (also zero-alloc after first use)

## Fixed Slot Types

Metal hardware does not support Float64 or ComplexF64. The following types have optimized pre-allocated slots:

| Type | Field |
|------|-------|
| `Float32` | `.float32` |
| `Float16` | `.float16` |
| `Int64` | `.int64` |
| `Int32` | `.int32` |
| `ComplexF32` | `.complexf32` |
| `Bool` | `.bool` |

Other types use the fallback dictionary (`.others`).

!!! note "No Float64/ComplexF64"
Apple Silicon GPUs do not natively support 64-bit floating point. Use `Float32` or `Float16` instead.
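A short sketch of how the fixed slots and the fallback dictionary come into play, based on the slot table above (the exact element types used here are illustrative):

```julia
using AdaptiveArrayPools, Metal

@with_pool :metal pool begin
    a = acquire!(pool, Float32, 1024)  # fixed slot (.float32)
    b = acquire!(pool, Int32, 1024)    # fixed slot (.int32)
    c = acquire!(pool, UInt8, 1024)    # fallback dictionary (.others)

    # acquire!(pool, Float64, 1024)    # not supported on Apple GPUs; use Float32
    fill!(a, 0.5f0)
end
```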

## Limitations

- **No Float64/ComplexF64**: Apple Silicon GPUs do not natively support 64-bit floating point
- **No `@maybe_with_pool :metal`**: Runtime toggle not supported for Metal backend
- **Single-device only**: Tested on single Apple GPU (multi-device untested)
- **Julia 1.11+**: Required for `setfield!`-based Array internals used by GPU extensions
- **Task-local only**: Each Task gets its own Metal pool, same as CPU

## Example: Matrix Computation

```julia
using AdaptiveArrayPools, Metal

@with_pool :metal pool function gpu_compute(n)
A = acquire!(pool, Float32, n, n)
B = acquire!(pool, Float32, n, n)
C = acquire!(pool, Float32, n, n)

fill!(A, 1.0f0); fill!(B, 2.0f0)
C .= A .+ B

return sum(C)
end

# Warmup
gpu_compute(100)

# Benchmark - zero GPU allocation
using BenchmarkTools
@benchmark gpu_compute(1000)
```

## Debugging

```julia
# Check pool state
pool_stats(:metal)

# Output:
# MetalAdaptiveArrayPool
# Float32 (fixed) [Metal]
# slots: 3 (active: 0)
# elements: 30000 (117.188 KiB)
```

## CUDA vs Metal

| Feature | CUDA | Metal |
|---------|------|-------|
| Backend symbol | `:cuda` | `:metal` |
| Array type | `CuArray` | `MtlArray` |
| Float64 support | Yes | No |
| ComplexF64 support | Yes | No |
| Julia requirement | 1.11+ | 1.11+ |
| Safety features | Full | Full |
| Lazy mode | Yes | Yes |
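Because the two backends share one API, porting pooled code between them amounts to swapping the backend symbol. A minimal illustration, assuming the `@with_pool`/`acquire!` forms shown earlier in this page (function names are hypothetical):

```julia
using AdaptiveArrayPools  # plus `using CUDA` or `using Metal` as appropriate

# Identical pooled kernel body; only the backend symbol differs.
function pooled_sum_cuda(n)
    @with_pool :cuda p begin
        A = acquire!(p, Float32, n, n)  # CuArray
        fill!(A, 1.0f0)
        return sum(A)
    end
end

function pooled_sum_metal(n)
    @with_pool :metal p begin
        A = acquire!(p, Float32, n, n)  # MtlArray
        fill!(A, 1.0f0)
        return sum(A)
    end
end
```

Sticking to `Float32` keeps such code portable across both backends, since Metal lacks `Float64`.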
55 changes: 32 additions & 23 deletions ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
@@ -9,39 +9,48 @@ Loaded automatically when `using CUDA` with AdaptiveArrayPools.
module AdaptiveArrayPoolsCUDAExt

using AdaptiveArrayPools
using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool
using CUDA

# Type definitions
include("types.jl")
# GPU pooling requires Julia 1.11+ (setfield!-based Array, arr_wrappers cache).
# On older Julia, the extension loads but provides no functionality.
@static if VERSION >= v"1.11-"

# Dispatch methods (allocate_vector, wrap_array, get_typed_pool!)
include("dispatch.jl")
using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool

# GPU-specific acquire (arr_wrappers + setfield!, _resize_to_fit!, _reshape_impl!)
include("acquire.jl")
# Type definitions
include("types.jl")

# Task-local pool (multi-device aware)
include("task_local_pool.jl")
# Dispatch methods (allocate_vector, wrap_array, get_typed_pool!)
include("dispatch.jl")

# State management (checkpoint!, rewind!, reset!, empty!)
include("state.jl")
# GPU-specific acquire (arr_wrappers + setfield!, _resize_to_fit!, _reshape_impl!)
include("acquire.jl")

# Safety: poisoning, escape detection, borrow tracking
include("debug.jl")
# Task-local pool (multi-device aware)
include("task_local_pool.jl")

# Display & statistics (pool_stats, show)
include("utils.jl")
# State management (checkpoint!, rewind!, reset!, empty!)
include("state.jl")

# Macro support (@with_pool :cuda)
include("macros.jl")
# Safety: poisoning, escape detection, borrow tracking
include("debug.jl")

# Convenience functions (Float32 default for zeros!/ones!)
include("convenience.jl")
# Display & statistics (pool_stats, show)
include("utils.jl")

# Exports (types only - functions are exported from main module)
export CuTypedPool, CuAdaptiveArrayPool
export GPU_FIXED_SLOT_FIELDS
# get_task_local_cuda_pool, get_task_local_cuda_pools are exported from AdaptiveArrayPools
# Macro support (@with_pool :cuda)
include("macros.jl")

# Convenience functions (Float32 default for zeros!/ones!)
include("convenience.jl")

# Exports (types only - functions are exported from main module)
export CuTypedPool, CuAdaptiveArrayPool
export GPU_FIXED_SLOT_FIELDS
# get_task_local_cuda_pool, get_task_local_cuda_pools are exported from AdaptiveArrayPools

else
@warn "AdaptiveArrayPoolsCUDAExt requires Julia 1.11+. GPU pooling is disabled." maxlog = 1
end # @static if

end # module
41 changes: 41 additions & 0 deletions ext/AdaptiveArrayPoolsMetalExt/AdaptiveArrayPoolsMetalExt.jl
@@ -0,0 +1,41 @@
"""
AdaptiveArrayPoolsMetalExt

Metal extension for AdaptiveArrayPools.jl. Provides GPU memory pooling
with the same checkpoint/rewind semantics as CPU pools.

Loaded automatically when `using Metal` with AdaptiveArrayPools.

Supports Metal.PrivateStorage only. Default element type is Float32.
Explicitly unsupported: Float64, ComplexF64.
"""
module AdaptiveArrayPoolsMetalExt

using AdaptiveArrayPools
using Metal

# GPU pooling requires Julia 1.11+ (setfield!-based Array, arr_wrappers cache).
# On older Julia, the extension loads but provides no functionality.
@static if VERSION >= v"1.11-"

using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool
using Metal.GPUArrays

include("types.jl")
include("dispatch.jl")
include("acquire.jl")
include("task_local_pool.jl")
include("state.jl")
include("debug.jl")
include("utils.jl")
include("macros.jl")
include("convenience.jl")

export MetalTypedPool, MetalAdaptiveArrayPool
export METAL_FIXED_SLOT_FIELDS

else
@warn "AdaptiveArrayPoolsMetalExt requires Julia 1.11+. GPU pooling is disabled." maxlog = 1
end # @static if

end # module