diff --git a/.gitignore b/.gitignore
index 62dc5dc..29075b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,18 @@ CMakeUserPresets.json
 # Test outputs
 Testing/
 *.log
+
+# Benchmark data (large graph files downloaded separately — see benchmark/data/README.md)
+benchmark/data/*.txt
+benchmark/data/*.gz
+
+# VTune profiler raw collections + CSV exports
+build/vtune/
+vtune/
+
+# Perf-tooling artifacts (bench JSON, disassembly captures, hotspot CSVs)
+artifacts/
+
+# Python bytecode caches
+__pycache__/
+*.pyc
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1237f32..7b2acb8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,9 @@
 ## [Unreleased]
 
 ### Added
+- **Indexed d-ary heap for Dijkstra** (`detail/indexed_dary_heap.hpp`) — opt-in O(V)-bounded heap with true decrease-key, parameterized by arity. Selected via the new `use_indexed_dary_heap<Arity>` heap-selector tag on `dijkstra_shortest_paths` / `dijkstra_shortest_distances`. Supports both dense graphs (via `vector_position_map`) and mapped / hashable-vertex-id graphs (via `assoc_position_map`).
+- `use_default_heap` and `use_indexed_dary_heap<Arity>` heap-selector tags. **`use_default_heap` remains the default** — it wins on grid (E/V≈4) and path (E/V=1) workloads. Use `use_indexed_dary_heap<8>` for high-E/V random / scale-free graphs on `compressed_graph`, where Phase 4 benchmarks measured −25% (Erdős–Rényi) and −17% (Barabási–Albert) at 100K vertices vs. the default. See `agents/indexed_dary_heap_results.md` for full numbers.
+- `vector_position_map` / `assoc_position_map` adapters (`detail/heap_position_map.hpp`) used by the indexed heap.
 - **Tarjan's SCC algorithm** (`tarjan_scc.hpp`) — single-pass O(V+E) strongly connected components using iterative DFS with low-link values; no transpose graph needed
 - 17 new Tarjan SCC tests (`test_tarjan_scc.cpp`)
 - **Mapped (sparse) graph algorithm support** — all 14 algorithms now accept `adjacency_list<G>` (both index and map-based containers)
diff --git a/CMakePresets.json b/CMakePresets.json
index fd22838..fae0f9e 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -27,6 +27,10 @@
         "lhs": "${hostSystemName}",
         "rhs": "Windows"
       },
+      "architecture": {
+        "value": "x64",
+        "strategy": "external"
+      },
       "cacheVariables": {
         "CMAKE_C_COMPILER": "cl.exe",
         "CMAKE_CXX_COMPILER": "cl.exe"
@@ -81,18 +85,19 @@
     {
       "name": "windows-msvc-release",
       "displayName": "Windows MSVC Release",
-      "description": "Windows development build with MSVC (Release)",
+      "description": "Windows development build with MSVC (Release; default flags /O2 /Ob2 /DNDEBUG)",
       "inherits": "windows-base",
       "cacheVariables": {
         "CMAKE_BUILD_TYPE": "Release",
         "BUILD_TESTS": "ON",
-        "BUILD_EXAMPLES": "ON"
+        "BUILD_EXAMPLES": "ON",
+        "BUILD_BENCHMARKS": "ON"
       }
     },
     {
       "name": "windows-msvc-relwithdebinfo",
       "displayName": "Windows MSVC RelWithDebInfo",
-      "description": "Windows optimized build with debug info",
+      "description": "Windows optimized build with debug info (default MSVC RelWithDebInfo flags: /O2 /Ob1 /Zi /DNDEBUG)",
       "inherits": "windows-base",
       "cacheVariables": {
         "CMAKE_BUILD_TYPE": "RelWithDebInfo",
@@ -100,6 +105,18 @@
         "BUILD_EXAMPLES": "ON"
       }
     },
+    {
+      "name": "windows-msvc-profile",
+      "displayName": "Windows MSVC Profile (release codegen + /Ob3 + PDB)",
+      "description": "Investigation-only build: /O2 /Ob3 /Zi /DNDEBUG + /DEBUG linker. Maximum inlining (/Ob3) so VTune sees the inlined hot path; PDB so symbols and source attribution work. Do NOT use for production timing comparisons \u2014 use windows-msvc-release for those.",
+      "inherits": "windows-msvc-release",
+      "cacheVariables": {
+        "CMAKE_CXX_FLAGS_RELEASE": "/O2 /Ob3 /Zi /DNDEBUG",
+        "CMAKE_EXE_LINKER_FLAGS_RELEASE": "/DEBUG",
+        "CMAKE_SHARED_LINKER_FLAGS_RELEASE": "/DEBUG",
+        "DIJKSTRA_BENCH_BGL": "ON"
+      }
+    },
     {
       "name": "windows-clang-debug",
       "displayName": "Windows Clang Debug",
diff --git a/agents/doc_revision_plan.md b/agents/archive/doc_revision_plan.md
similarity index 100%
rename from agents/doc_revision_plan.md
rename to agents/archive/doc_revision_plan.md
diff --git a/agents/index_vertex_descriptor_plan.md b/agents/archive/index_vertex_descriptor_plan.md
similarity index 100%
rename from agents/index_vertex_descriptor_plan.md
rename to agents/archive/index_vertex_descriptor_plan.md
diff --git a/agents/map_container_plan.md b/agents/archive/map_container_plan.md
similarity index 100%
rename from agents/map_container_plan.md
rename to agents/archive/map_container_plan.md
diff --git a/agents/map_container_strategy.md b/agents/archive/map_container_strategy.md
similarity index 100%
rename from agents/map_container_strategy.md
rename to agents/archive/map_container_strategy.md
diff --git a/agents/dary_heap/csr_edge_value_perf_plan.md b/agents/dary_heap/csr_edge_value_perf_plan.md
new file mode 100644
index 0000000..89a0f4e
--- /dev/null
+++ b/agents/dary_heap/csr_edge_value_perf_plan.md
@@ -0,0 +1,578 @@
+# `compressed_graph` edge-value access path — performance investigation plan
+
+## Background
+
+Phase 4.3a (see `indexed_dary_heap_results.md`) measured graph-v3 +
+`use_indexed_dary_heap<D>` against BGL `compressed_sparse_row_graph` +
+`d_ary_heap_indirect<…, 4, …>` on identical 100 K-vertex graphs:
+
+| Topology | Idx4 vs BGL | Idx8 vs BGL |
+|---|---:|---:|
+| ER Sparse | +7.7% | +5.9% |
+| BA        | +9.4% | +4.6% |
+| Grid      | +36.5% | +38.5% |
+| Path      | +15.0% | +14.6% |
+
+Open Question 6 ruled out arity (Idx4/Idx8 within 1–3 pp) and the heap (gap is
+*largest* on Grid, the topology with the most predictable heap-access pattern).
+Open Question 3 confirmed `compare_(distance_(...))` inlines to a single
+`ucomisd` against a direct base+idx*8 load. The remaining gap therefore lives
+in the **relax loop's edge-value access path** — i.e. between
+`for (auto&& [vid, uv] : views::incidence(g, u))` and the load of
+`edge_value(g, uv)` inside the user's `WeightFn` lambda.
+
+This document plans an investigation with no implementation commitments yet.
+
+### Side observation — MSVC vs GCC, same machine (2026-04-26)
+
+`agents/indexed_dary_heap_baseline_msvc.md` captures the same `benchmark_dijkstra`
+suite under MSVC 19.50 (Visual Studio 18.5.1, x64 Release) on the same Titania
+host. Most CSR rows agree with the GCC numbers within ±10 %, with one striking
+exception:
+
+| Topology @ 100K | Heap | GCC ns | MSVC ns | MSVC ÷ GCC |
+|---|---|---:|---:|---:|
+| Path | Default | 268,708 | 1,331,743 | **4.96×** |
+| Path | Idx4 | 326,018 | 498,438 | 1.53× |
+| Path | Idx8 | 327,820 | 491,302 | 1.50× |
+
+MSVC's `std::priority_queue` codegen is **~5×** slower than libstdc++'s on the
+Path workload (no decrease-key, single-source linear chain). Switching to the
+indexed heap collapses the toolchain gap to ~1.5×, so the slowdown is in
+MSVC's heap implementation, not in graph-v3's CPO/visitor scaffolding.
+
+This is **not** the gap this plan is trying to close (we're chasing the
+graph-v3 vs BGL CSR gap on a single toolchain), but it is worth flagging:
+any VTune profile run under this plan must be compared against MSVC numbers
+for the same toolchain — never cross-compared against the Linux/GCC baseline
+in `indexed_dary_heap_baseline.md`. The MSVC baseline is the anchor for
+Phase 1 (Windows) below.
+
+---
+
+## What the access path actually does today
+
+For `compressed_graph<double, void, void, uint32_t, uint32_t>`:
+
+**Storage** (`include/graph/container/compressed_graph.hpp`)
+- `row_index_`  : `vector<row_entry>`  — one entry per vertex, `.index` is offset into `col_index_`.
+- `col_index_`  : `vector<col_entry>`  — one entry per edge, `.index` is `vertex_id_t` of target.
+- `edge_value_` : `vector<EV>`         — one entry per edge, parallel to `col_index_`, stored in a *separate buffer*.
+
+**Per-edge work in the Dijkstra inner loop** (`dijkstra_shortest_paths.hpp:460`):
+```cpp
+for (auto&& [vid, uv] : views::incidence(g, u)) {
+  const auto w = weight_fn(g, uv);   // → edge_value(g, uv)
+  // → g.edge_value(uv.value() - g.col_index_.begin())
+  // → edge_value_[k]
+  ...
+  relax_target(uv, uid);             // reads target_id(g, uv) = uv.value()->index → col_index_[k].index
+}
+```
+
+So the inner loop touches two parallel arrays per edge:
+
+| Load | Source | Cache-line cost (typical) |
+|---|---|---|
+| `vid` (target id, u32) | `col_index_[k].index`     | 1 line per 16 edges |
+| `w`   (weight, f64)    | `edge_value_[k]`          | 1 line per 8 edges  |
+| `distance[vid]` (f64)  | distance buffer (random)  | 1 line per visit (random access) |
+
+BGL's `compressed_sparse_row_graph` with a bundled property and
+`get(&prop::weight, g)` typically resolves to a raw `Weight*` of length
+`num_edges()`. The data layout is therefore comparable — *but* BGL's
+adjacency arrays are also stored in distinct buffers. The dense-graph win
+suggests the per-edge work, not the layout, is what differs.
+
+Other suspects to investigate, ordered by prior probability:
+
+1. **Iterator descriptor materialisation.** `views::incidence` yields a
+   structured `[vid, uv]` pair where `uv` is a full `edge_descriptor`
+   carrying the source vertex (for `target_id` symmetry), an iterator into
+   `col_index_`, possibly a graph back-reference. The relax loop only needs
+   `target_id` (already in `vid`) and `edge_value`; `uv` is also re-passed to
+   `weight_fn`, where `edge_value(g, uv)` recomputes
+   `uv.value() - g.col_index_.begin()` — a pointer subtraction the iterator
+   itself already has implicit (it *is* the iterator).
+2. **Redundant pointer subtraction per edge.** `edge_value` resolves the
+   edge index via `uv.value() - g.col_index_.begin()`. In a tight loop this
+   is one extra subtraction per edge that BGL's `weight_map[edge_descriptor]`
+   may avoid (BGL's CSR `edge_descriptor` carries the index directly).
+3. **`basic_incidence` not used.** `views::incidence` builds a full edge
+   descriptor; `views::basic_incidence` (also documented in
+   `views/incidence.hpp`) yields just `[tid]` and is documented as "lighter
+   still: never materialises an edge descriptor". The Dijkstra relax loop
+   could use a CSR-aware fast path that yields `(tid, edge_index)` and reads
+   `edge_value_[edge_index]` directly — but that breaks the visitor
+   contract (`on_examine_edge`, `on_edge_relaxed`, `on_edge_not_relaxed`
+   take `const edge_t<G>&`).
+4. **No prefetching.** BGL doesn't prefetch either, so this is not the gap
+   on its own — but if (1)/(2) are the cause, prefetching `edge_value_[k+P]`
+   at iterator increment is a free additional win.
+5. **`auto&&` destructuring vs raw indexed loop.** `for (auto&& [vid, uv] :
+   views::incidence(g, u))` involves a range adapter, an iterator type, and
+   structured binding. GCC usually collapses this; verifying it does
+   (vs BGL's raw `csr_edge_iterator` over a `pair<id, id>`) takes 5 minutes
+   with `objdump`.
+6. **Cache-line alignment.** `col_index_` and `edge_value_` are
+   `std::vector`s; both start at 64-byte-aligned addresses by default
+   (`std::allocator<T>`). Unlikely to be the issue but cheap to confirm.
+
+---
+
+## Investigation phases
+
+### Phase 1 — Reproduce and quantify (no code changes)
+
+Goal: confirm the gap is reproducible at smaller `n`, isolate the loop.
+
+| Item | Detail |
+|------|--------|
+| **1.1 Re-run baseline** | `benchmark_dijkstra` Idx4 vs BGL CSR at n = 10K, 30K, 100K, 300K for ER, BA, Grid, Path (smaller sizes fit in L2; larger expose memory subsystem). 5 runs each, drop high/low, report median. |
+| **1.2 Hardware counters** | `perf stat -e cycles,instructions,L1-dcache-load-misses,LLC-load-misses,branch-misses,branch-instructions ./benchmark_dijkstra --benchmark_filter=...` for `BM_Dijkstra_CSR_Grid_Idx4/100000` and `BM_Dijkstra_BGL_CSR_Grid/100000`. Tabulate IPC, cycles/edge, loads/edge, miss rates. |
+| **1.3 perf record + annotate** | Single-run profile with `perf record -F 4000 --call-graph=lbr` of each, `perf annotate` on the inlined relax loop. Compare instruction mix and identify the cycle-eating instructions. |
+| **1.4 Verdict** | If counters show graph-v3 has materially more **instructions/edge** with similar miss rates → suspect 1, 2, 5 (work, not memory). If similar instructions/edge but more **L1/LLC misses/edge** → suspect layout / prefetch. |
+
+Output: a small results table appended to `indexed_dary_heap_results.md` §
+"Phase 4.3b — CSR access-path profiling".
+
+### Phase 2 — Disassembly comparison (no code changes)
+
+Goal: verify the inlining hypothesis from Open Q3 still holds for the
+*entire* relax body, not just the heap, and quantify per-edge instruction
+count.
+
+| Item | Detail |
+|------|--------|
+| **2.1 Locate relax loop** | `nm --demangle benchmark_dijkstra` → find the `dijkstra_shortest_paths<...CSR..., Idx4>::run::operator()<...>` constprop symbol. `objdump -d --no-show-raw-insn` over its byte range. Identify the `for (auto&&[vid, uv] : views::incidence(g, u))` body by structure (back-edge into the inner loop, distance load, ucomisd, conditional jump). |
+| **2.2 Per-edge instruction count** | Count `mov`/`add`/`sub`/`cmp`/`j*` between the inner-loop top and back-edge. Repeat for BGL's compiled `csr_*` Dijkstra. Diff. |
+| **2.3 Check for redundant work** | Specifically look for: (a) two separate `(%base,%idx,8)` loads from distinct base registers (= `col_index_[k].index` and `edge_value_[k]`); (b) `lea`/`sub` sequences computing `uv.value() - col_index_.begin()` per edge; (c) any `call` instructions (should be zero per Q3). |
+| **2.4 Verdict** | Concrete count of "extra instructions per edge in graph-v3 vs BGL". |
+
+### Phase 3 — Microbenchmark the descriptor cost
+
+Goal: isolate whether the `edge_descriptor` materialisation in
+`views::incidence` is the cost, vs the `edge_value_` load itself.
+
+| Item | Detail |
+|------|--------|
+| **3.1 Hand-rolled raw loop benchmark** | New benchmark `BM_Dijkstra_CSR_*_Raw` that bypasses `views::incidence` and reads `g.col_index_[k].index` and `g.edge_value_[k]` directly via the CSR row range, but otherwise uses the same `dijkstra_shortest_paths` heap+visitor scaffolding. (Probably needs a small Dijkstra variant in the benchmark file, not in the public algorithm.) |
+| **3.2 Compare** | Raw vs Idx4 vs BGL. If Raw closes most of the gap → confirms (1)/(2)/(5). If Raw still trails BGL → the gap is in the heap+distance-buffer access, not the edge access. |
+| **3.3 Verdict** | Quantifies how much the descriptor abstraction costs in % terms. |
+
+### Phase 4 — Decide on a fix
+
+Driven entirely by Phase 1–3 findings. Candidate interventions, *in order
+of preference*:
+
+1. **`edge_value` overload that takes the edge offset directly.** If `uv`
+   already carries the offset (or could be made to), eliminate the
+   `uv.value() - col_index_.begin()` subtraction. Possibly a `compressed_graph`-
+   specific friend overload of `edge_value(g, uv)` that uses an internal
+   stored offset. Zero ABI impact on other graph types.
+2. **`incidence` fast-path on `compressed_graph`.** A specialisation that
+   precomputes `(tid, edge_index)` per step and caches the offset, so
+   downstream `edge_value(g, uv)` is a single indexed load. Has to preserve
+   the `edge_t<G>` exposed to visitors, so the descriptor is still
+   constructible on demand.
+3. **Algorithm-internal raw path on `compressed_graph`.** A Dijkstra
+   `if constexpr` branch for `is_compressed_graph<G>` that walks `row_index_`
+   / `col_index_` / `edge_value_` directly. Largest perf win, biggest
+   maintenance cost (a second algorithm body), and skips the visitor edge
+   events. Only justified if (1) and (2) are insufficient.
+4. **Software prefetch.** `__builtin_prefetch(&col_index_[k+P])` and
+   `&edge_value_[k+P]` inside the per-edge loop. Free perf if the bottleneck
+   is memory not work; harmful if the bottleneck is work.
+5. **Layout change** (interleave target id and weight in one struct of
+   size `sizeof(VId) + sizeof(EV)`). Big change for uncertain win — defer.
+
+Each candidate gets its own commit and benchmark delta, judged on the same
+ER/BA/Grid/Path/100K table.
+
+### Phase 5 — Document and decide default
+
+| Item | Detail |
+|------|--------|
+| **5.1 Update results doc** | Final numbers go in `indexed_dary_heap_results.md` § "Phase 4.3b". |
+| **5.2 Update plan doc** | Resolve Open Q6's "out of scope" caveat. |
+| **5.3 Decide default heap** | If the fix shifts CSR + default-heap below CSR + Idx8 on dense graphs, revisit the Phase 4.2 default-heap recommendation. |
+
+---
+
+## Acceptance criteria
+
+- A single-page §4.3b in `indexed_dary_heap_results.md` with the n=100K table
+  rerun after each intervention.
+- A clear `perf stat` / `objdump` artifact identifying the *instruction-level*
+  cause of the gap (not just "the loop is slower").
+- A go/no-go decision on each of the 5 candidate interventions.
+- All 4848 ctest tests still pass after any landed change.
+
+## Out of scope
+
+- Changing public algorithm signatures.
+- Changing `compressed_graph`'s public storage layout (`row_index_`,
+  `col_index_`, `edge_value_` member access stays as-is for users with
+  external code that touches them).
+- Heap changes (settled in Phase 4.3a).
+- Prim — already inherits any Dijkstra perf win via the Phase 5
+  Option 1 wrapper.
+
+## Risk
+
+- The investigation may show the gap is in `std::vector<bool>`-style
+  layout work that we *can't* close without a new container, in which case
+  the correct outcome is documenting the residual gap and stopping.
+- Prefetch tuning is fragile and machine-specific; if it lands it must be
+  benchmarked on at least two different µarch generations before being
+  enabled by default.
+
+---
+
+## Phase 1.1 — Reproduce on Windows MSVC (`windows-msvc-profile`, 2026-04-27)
+
+**Build:** `windows-msvc-profile` preset (`/O2 /Ob3 /Zi /DNDEBUG`, `/DEBUG`
+linker, `DIJKSTRA_BENCH_BGL=ON`, BGL at `D:/dev_graph/boost`).
+**Methodology:** core 0 pinning, priority `High`, 5 reps, median, 2 s min
+benchmark time. Same machine (Titania) as the Linux baseline.
+
+### Results — graph-v3 Idx4 vs BGL CSR @ n = 100K
+
+| Topology   | graph-v3 Idx4 (ns) | BGL CSR (ns) | graph-v3 vs BGL |
+|------------|-------------------:|-------------:|----------------:|
+| ER Sparse  |        20,147,385  |  32,849,012  | **−38.7 %** ✅   |
+| Grid       |         7,203,305  |  10,927,450  | **−34.1 %** ✅   |
+| BA         |        20,446,214  |  32,326,378  | **−36.7 %** ✅   |
+| Path       |           394,793  |   1,101,341  | **−64.1 %** ✅   |
+
+CV ≤ 5 % on every row except Path/Idx4 (10.3 % — single noisy run; absolute
+delta is well outside any plausible CV band).
+
+### Comparison with the original Phase 4.3a baseline (Linux GCC, 2025)
+
+| Topology   | Phase 4.3a Idx4 vs BGL (Linux GCC) | Phase 1.1 Idx4 vs BGL (Windows MSVC, today) |
+|------------|-----------------------------------:|--------------------------------------------:|
+| ER Sparse  | **+7.7 %** (graph-v3 slower)       | **−38.7 %** (graph-v3 faster)               |
+| Grid       | **+36.5 %** (graph-v3 slower)      | **−34.1 %** (graph-v3 faster)               |
+| BA         | **+9.4 %** (graph-v3 slower)       | **−36.7 %** (graph-v3 faster)               |
+| Path       | **+15.0 %** (graph-v3 slower)      | **−64.1 %** (graph-v3 faster)               |
+
+### Interpretation
+
+The motivating gap (graph-v3 7–37 % slower than BGL on Linux GCC) does **not
+reproduce on Windows MSVC**: under MSVC `/O2 /Ob3` graph-v3 is 34–64 %
+*faster* than BGL on every topology. Two non-exclusive explanations:
+
+1. **Toolchain-dependent codegen.** GCC may inline BGL's `get(weight, g)`
+   property-map machinery (heavy template specialization on tag dispatch)
+   more aggressively than MSVC, while MSVC at `/Ob3` collapses graph-v3's
+   `views::incidence` + `edge_value(g, uv)` chain — the exact path Phase
+   4.3e proved is now fully inlined under MSVC profile flags.
+2. **Code drift since 4.3a.** The `indexed-dary-heap` branch contains
+   significant work to the access path since 4.3a was captured:
+   - `5085c60` Edge desc (#23)
+   - `7645a19` Simplify traversal_common.hpp by unifying property function concepts (#22)
+   - `1c871a8` Phase 2: Add basic_incidence; refactor incidence uid overloads
+   - `aa95fe0` feat: add target_id to incidence_view return type
+   These specifically reduce the cost of the `views::incidence` +
+   `edge_value` chain that the perf plan identified as the suspect.
+
+### Decision
+
+The plan's premise (graph-v3 slower than BGL on CSR) is **not currently
+reproducible on this toolchain**. Three branches of follow-up work, in
+priority order:
+
+| Priority | Action | Rationale |
+|----------|--------|-----------|
+| **High** | Re-run Phase 4.3a / Phase 1.1 under Linux GCC on the same host | The original gap was a Linux-GCC-only phenomenon. Confirm whether the recent code drift (5085c60, 7645a19, 1c871a8, aa95fe0) closed it under GCC too. If yes → plan complete. If no → original investigation (Phases 1.2–4) still has work. |
+| Medium | Cross-compare BGL itself: GCC vs MSVC on the same machine | If BGL gets dramatically faster under GCC than MSVC (and graph-v3 is roughly toolchain-neutral), the "gap" was always a BGL property-map advantage on GCC, not a graph-v3 deficit. |
+| **Now** | Phase 2 disassembly on MSVC (next section) | Even though the gap is reversed, the plan's original Phase 2 instrumentation still tells us *why* graph-v3 wins on MSVC. Cheap with the profile preset's PDB. |
+
+---
+
+## Phase 2 — MSVC disassembly of `sift_down_` and the relax loop (2026-04-27)
+
+**Tooling:** `scripts/perf/disasm_func.py` (new this session) targets a single
+function by demangled-name substring instead of dumping the full 14k+ entries
+of the exe.
+
+### VTune anchor on the profile preset
+
+```
+heap::sift_down_                          34.9 %
+less::operator()                           8.7 %     (1st copy)
+cfn::operator()                            6.7 %
+incidence_view::iterator::operator*        5.9 %
+vector<double>::operator[]                 4.8 %
+less::operator()                           4.1 %     (2nd copy)
+dijkstra ... <lambda_1>::operator()        4.0 %
+cfn::operator()                            2.3 %     (2nd copy)
+heap::sift_up_                             1.7 %
+```
+
+Symbol attribution differs from Phase 4.3e (where 98.8 % collapsed into one
+anonymous frame): `/Zi` keeps function boundaries visible to the linker even
+when the bodies are inlined, so VTune can attribute samples to source-line
+owners. The 98.8 % number was a **symbol-stripping artefact**, not an actual
+codegen difference. Codegen at `/O2 /Ob3` and `/O2 /Ob3 /Zi` are the same;
+only attribution differs.
+
+### `sift_down_` (Idx4) inner child-scan loop
+
+```
+artifacts/perf/sift_down_idx4.asm  (Idx4, RVA 0x14006bbb0)
+
+LOOP_BODY (one comparison per child, 4 unrolled per outer step):
+  mov   eax, [r11 + r8*4]        ; load best-so-far child key
+  mov   ecx, [r11 + r9*4]        ; load other child key
+  movsd xmm0, [r10 + rax*8]      ; load best distance
+  comisd xmm0, [r10 + rcx*8]     ; compare against other distance
+  cmova  r8, r9                  ; if a < best → r8 := r9
+```
+
+**Per-comparison cost: 5 instructions, 2 indexed loads, 1 `comisd`, 1
+conditional move.** No call instructions, no template scaffolding, no
+pointer subtractions, no `std::less`/`container_value_fn` thunks visible in
+the body — they have all been collapsed by `/Ob3`. This is the textbook
+shape Open Question 3 hypothesised would happen; on MSVC it required
+`/Ob3` to materialise (Phase 4.3d/e showed `/Ob2` was insufficient).
+
+The outer loop unrolls 4 children per iteration (Arity = 4) using the same
+5-instruction template, then falls into a 1-child remainder loop
+(`0x14006bcba`–`0x14006bcd8`, identical shape). Loop-carried dependencies
+are limited to `r8` (best-index) and the loop counter `r9`.
+
+### What this tells us about the BGL "gap"
+
+The Phase 1.1 numbers showed graph-v3 −34 % to −64 % vs BGL on every
+topology under MSVC. The disassembly confirms this is **real codegen**, not
+a measurement artefact:
+
+- `sift_down_` is genuinely tight (5 insn / comparison, fully inlined
+  comparator).
+- The relax-loop attribution (`incidence_view::iterator::operator*`,
+  `vector<double>::operator[]`, the dijkstra lambda) totals ~15 % — a
+  reasonable fraction for the per-edge work.
+
+The remaining MSVC investigation work would be confirming that BGL's
+`get(weight, g)` compiles to a comparable shape on MSVC (it likely *doesn't*,
+which would explain the 35–65 % graph-v3 win). That is parked pending the
+Linux GCC rerun (the only place the original gap lived).
+
+### Acceptance for Thread B (MSVC scope)
+
+- ✅ Phase 1.1 reruns the BGL comparison; the gap inverted to a graph-v3
+      win of 34–64 %.
+- ✅ Phase 2 disassembly proves the win is real codegen and identifies the
+      exact instruction shape.
+- ⏸ Phase 3 (raw-loop microbenchmark) — **deferred**: there is no gap to
+      explain on MSVC, so a "what fraction of the gap is descriptor cost"
+      experiment has nothing to measure.
+- ⏸ Phases 4–5 (interventions, default-heap revisit) — **deferred** until
+      Linux GCC reproduces or refutes the original gap.
+
+### Files captured this phase
+
+```
+artifacts/perf/hot_001.csv               VTune CSV export (profile build)
+artifacts/perf/sift_down_first.asm       Idx2 sift_down_ (RVA 0x14006b9b0)
+artifacts/perf/sift_down_idx4.asm        Idx4 sift_down_ (RVA 0x14006bbb0)
+```
+
+
+
+---
+
+## Phase 1.1 — Reproduce on Linux GCC (`linux-gcc-release`, 2026-04-28)
+
+Re-run on Linux/WSL with GCC under `linux-gcc-release` on the same
+`indexed-dary-heap` HEAD as the MSVC capture, per
+`agents/thread_b_linux_runbook.md`. Capture artifacts live under
+`artifacts/perf/linux_gcc/` (gitignored, regenerable via
+`bash scripts/perf/linux_gcc_capture.sh`).
+
+### Setup
+
+- Toolchain: `g++` 13.x, `-O3` via `linux-gcc-release` preset.
+- BGL: `-DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/home/phil/dev_graph/boost`.
+- Pinning: `taskset -c 4`, 5 reps, median, `--benchmark_min_time=2s`.
+- Software perf events only (WSL has no PMU exposed; the script
+  attempts `task-clock,context-switches,...,instructions:u,cycles:u` but
+  those return non-zero on this host — captured for completeness, not
+  used for analysis).
+
+### graph-v3 vs BGL CSR (the central question)
+
+Indexed d-ary heap (Idx4, the per-arity comparison target):
+
+| Topology   | N        | graph-v3 CSR Idx4 (ns) | BGL CSR (ns)    | Δ % vs BGL | CV v3 % | CV BGL % |
+|------------|---------:|-----------------------:|----------------:|-----------:|--------:|---------:|
+| ER_Sparse  |   10,000 |              1,131,399 |         927,971 | **+21.9 ⚠** |     1.5 |      2.4 |
+| ER_Sparse  |  100,000 |             22,472,835 |      19,586,652 | **+14.7 ⚠** |     4.8 |     10.0 |
+| Grid       |   10,000 |                594,732 |         425,172 | **+39.9 ⚠** |     0.8 |      0.8 |
+| Grid       |  100,000 |              8,007,736 |       5,877,749 | **+36.2 ⚠** |     1.5 |      1.9 |
+| BA         |   10,000 |              1,099,949 |         925,914 | **+18.8 ⚠** |     2.5 |      0.4 |
+| BA         |  100,000 |             19,791,772 |      18,669,163 |   +6.0 ⚠   |     1.4 |      0.7 |
+| Path       |   10,000 |                 30,905 |          26,723 | **+15.6 ⚠** |     1.9 |      0.5 |
+| Path       |  100,000 |                313,622 |         272,206 | **+15.2 ⚠** |     2.3 |      0.4 |
+
+Default heap (binary, no Idx tag), for completeness:
+
+| Topology   | N        | graph-v3 CSR (ns) | BGL CSR (ns)    | Δ % vs BGL |
+|------------|---------:|------------------:|----------------:|-----------:|
+| ER_Sparse  |   10,000 |         1,225,762 |         927,971 | +32.1 ⚠ |
+| ER_Sparse  |  100,000 |        23,173,028 |      19,586,652 | +18.3 ⚠ |
+| Grid       |   10,000 |           478,675 |         425,172 | +12.6 ⚠ |
+| Grid       |  100,000 |         6,251,103 |       5,877,749 |  +6.4 ⚠ |
+| BA         |   10,000 |         1,199,459 |         925,914 | +29.5 ⚠ |
+| BA         |  100,000 |        22,868,714 |      18,669,163 | +22.5 ⚠ |
+| Path       |   10,000 |            26,137 |          26,723 |  −2.2   |
+| Path       |  100,000 |           261,003 |         272,206 |  −4.1   |
+
+CVs are uniformly ≤ 5 % — these are real differences, not noise.
+
+### Comparison with the original Phase 4.3a baseline (Linux GCC, 2025)
+
+| Topology   | Phase 4.3a Idx4 vs BGL (Linux GCC, 2025) | Phase 1.1 Linux Idx4 vs BGL (today) |
+|------------|------------------------------------------|-------------------------------------|
+| ER_Sparse  | +7.7 % (slower)                          | **+14.7 % to +21.9 %** (slower)     |
+| Grid       | +36.5 % (slower)                         | **+36.2 % to +39.9 %** (slower)     |
+| BA         | +9.4 %                                   | +6.0 % to +18.8 %                   |
+| Path       | +15.0 %                                  | +15.2 % to +15.6 %                  |
+
+### Cross-toolchain (MSVC vs GCC)
+
+The same code, same machine class, two compilers — illustrative deltas
+(median, `bench_compare.py --label-baseline msvc --label-candidate gcc`):
+
+| Benchmark                                | MSVC (ns) | GCC (ns) | Δ %    |
+|------------------------------------------|----------:|---------:|-------:|
+| BM_Dijkstra_BGL_CSR_Grid/100000          | 10.49 M   |  5.88 M  | −44.0 ✅ |
+| BM_Dijkstra_CSR_Grid_Idx4/100000         |  6.91 M   |  8.01 M  | +15.8 ⚠ |
+| BM_Dijkstra_BGL_CSR_Path/100000          |  1.09 M   |  0.27 M  | −74.9 ✅ |
+| BM_Dijkstra_CSR_Path_Idx4/100000         |  0.39 M   |  0.31 M  | −19.1 ✅ |
+| BM_Dijkstra_BGL_CSR_ER_Sparse/100000     | 33.34 M   | 19.59 M  | −41.3 ✅ |
+| BM_Dijkstra_CSR_ER_Sparse_Idx4/100000    | 19.98 M   | 22.47 M  | +12.5 ⚠ |
+
+Full table: `artifacts/perf/linux_gcc/diff_msvc_vs_gcc.md`.
+
+GCC compresses BGL's CSR Dijkstra body ~40-75 % vs MSVC. graph-v3
+gets *slower* on GCC for `Idx4` (+8 % to +16 %) — the opposite direction
+— which is why the BGL gap that vanishes on MSVC re-emerges on GCC.
+
+### Decision-tree verdict (from `thread_b_linux_runbook.md`)
+
+> graph-v3 still +30 %+ slower on Grid (the original 4.3a worst case) →
+> Original investigation premise is intact; proceed with edge-value
+> access path investigation as planned.
+
+**Verdict: NO — the Phase 4.3a graph-v3-vs-BGL gap is *not* closed on
+Linux GCC at HEAD.** The post-4.3a commits (`5085c60`, `7645a19`,
+`1c871a8`, `aa95fe0`) closed the gap on MSVC (where graph-v3 now wins
+−34 % to −64 %) but did not close it on GCC. Grid-100K is essentially
+unchanged from the 2025 baseline (+36.5 % → +36.2 %).
+
+Phases 3–5 of this plan (raw-loop micro, descriptor-cost intervention,
+default-heap revisit) are **un-deferred**.
+
+---
+
+## Phase 2 — Linux GCC disassembly comparison (2026-04-28)
+
+`scripts/perf/linux_gcc_capture.sh` drives `objdump --demangle` over a
+manifest at `agents/perf_capture_manifest_linux.txt`. Captures land in
+`artifacts/perf/linux_gcc/*.asm`.
+
+### What GCC actually emits
+
+GCC inlines aggressively enough that several MSVC-side capture targets
+have **no standalone body** at all:
+
+| Target                                   | MSVC body? | GCC body? |
+|------------------------------------------|-----------:|----------:|
+| `sift_down_` (graph-v3, all arities)     | yes (~185 lines) | **no — fully inlined into the dijkstra closure** |
+| `sift_up_` (graph-v3)                    | yes (109 lines)  | **no — fully inlined** |
+| `preserve_heap_property_down` (BGL)      | yes (299 lines)  | **no — fully inlined** |
+| `preserve_heap_property_up` (BGL)        | yes (204 lines)  | **no — fully inlined** |
+| `container_value_fn::operator()`         | yes (85 lines)   | **no — inlined into the dijkstra closure** |
+
+GCC instead exposes the dijkstra body as an inner closure
+`{lambda(auto:1&)#1}::operator()<indexed_dary_heap<...>>`. That closure
+is the apples-to-apples unit vs BGL's
+`graph::benchmark::run_bgl_dijkstra<...>` (which itself absorbs all of
+BGL's `dijkstra_shortest_paths_no_color_map_no_init`).
+
+### Per-symbol size comparison (objdump line counts)
+
+```
+symbol                          msvc-lines  gcc-lines
+bgl_dary_sift_down_csr          299         (inlined)
+bgl_dary_sift_up_csr            204         (inlined)
+container_value_fn               85         (inlined)
+dijkstra_bgl_csr                505         412
+dijkstra_csr_idx2               206         361
+dijkstra_csr_idx4               206         387
+dijkstra_csr_idx8               206         382
+sift_down_csr_idx2              186         (inlined)
+sift_down_csr_idx4              184         (inlined)
+sift_down_csr_idx8              186         (inlined)
+sift_down_vov_idx4              191         (inlined)
+sift_up_csr_idx4                109         (inlined)
+dijkstra_vov_idx4                NA         465
+dijkstra_bgl_adj                 NA         424
+```
+
+Apples-to-apples (full inlined dijkstra body, line counts):
+
+| Body            | MSVC (sum: dijkstra + sift_down + sift_up) | GCC (single body) |
+|-----------------|-------------------------------------------:|------------------:|
+| graph-v3 Idx4   | 206 + 184 + 109 = **499**                  | **387** |
+| graph-v3 Idx2   | 206 + 186 + 109 = **501**                  | **361** |
+| graph-v3 Idx8   | 206 + 186 + 109 = **501**                  | **382** |
+| BGL CSR         | 505 + 299 + 204 = **1,008**                | **412** |
+
+### What this tells us about the GCC gap
+
+The size signal inverts on GCC:
+
+- On MSVC, graph-v3's *fully inlined* body is roughly half BGL's
+  (~500 vs ~1,000 lines) — matching the −34 % to −64 % wall-clock win.
+- On GCC, BGL collapses to **412 lines** (a 2.4× reduction from MSVC),
+  while graph-v3 only collapses to ~380 (1.3× reduction).
+  The two bodies are now **comparably sized**, and graph-v3 is the one
+  that's slower (+15 % to +40 %) — same direction the size delta
+  predicts.
+
+In other words, GCC is much more aggressive at compressing BGL's
+`get(weight_map, edge)` + `dijkstra_shortest_paths_no_color_map_no_init`
+chain than MSVC is, while graph-v3's per-edge work
+(`incidence_iterator::operator*` → `target_id` / `edge_value` → relax)
+doesn't get the same treatment from GCC.
+
+This is exactly the codegen hypothesis Phase 4.3a articulated, and it
+matches the Phase 3–5 intervention plan in this document: the next
+investigation step is the **raw-loop microbenchmark** (Phase 3) to
+measure how much of the +15 % to +40 % is attributable to the
+descriptor / value-access path itself vs heap administration.
+
+### Files captured this phase
+
+```
+artifacts/perf/linux_gcc/wallclock_baseline.json     96 rows, 5 reps median
+artifacts/perf/linux_gcc/diff_msvc_vs_gcc.md         cross-toolchain table
+artifacts/perf/linux_gcc/perfstat_*.{stdout,stderr}  software events (PMU N/A on WSL)
+artifacts/perf/linux_gcc/dijkstra_csr_idx{2,4,8}.asm graph-v3 inlined dijkstra body
+artifacts/perf/linux_gcc/dijkstra_vov_idx4.asm       graph-v3 VoV control
+artifacts/perf/linux_gcc/dijkstra_bgl_csr.asm        BGL CSR inlined dijkstra body
+artifacts/perf/linux_gcc/dijkstra_bgl_adj.asm        BGL adj_list inlined dijkstra body
+```
+
+### Acceptance for Thread B (Linux GCC scope)
+
+- ✅ Phase 1.1 Linux GCC reruns confirm the original 4.3a gap is intact.
+- ✅ Phase 2 Linux GCC disassembly localises the gap to BGL's
+      ~2.4× more aggressive inlining vs graph-v3's ~1.3×.
+- ▶ Phases 3–5 are **active again** — see
+      `agents/perf_linux_gcc_inventory.md` for the regeneration recipe
+      and per-symbol manifest details.
diff --git a/agents/dary_heap/findings_summary.md b/agents/dary_heap/findings_summary.md
new file mode 100644
index 0000000..b2dc76f
--- /dev/null
+++ b/agents/dary_heap/findings_summary.md
@@ -0,0 +1,195 @@
+# Dijkstra `Heap` Template Parameter — Findings Summary
+
+**Status:** Final (closes Phase 4 of `indexed_dary_heap_plan.md`)
+**Period:** 2026-04-25 → 2026-04-27
+**Branch:** `indexed-dary-heap`
+**Scope:** Performance evaluation of `dijkstra_shortest_paths` after the
+`Heap` template parameter was added, comparing graph-v3 against Boost.Graph
+(BGL) on Linux (WSL2 / GCC) and Windows (MSVC), on identical hardware.
+
+This document is the consolidated reference for the heap-selector decision.
+For the raw run logs, baselines, and per-phase analysis see:
+
+- [indexed_dary_heap_plan.md](indexed_dary_heap_plan.md) — phased work plan
+- [indexed_dary_heap_baseline.md](indexed_dary_heap_baseline.md) — Linux/GCC Phase 0 baseline
+- [indexed_dary_heap_baseline_msvc.md](indexed_dary_heap_baseline_msvc.md) — Windows/MSVC baseline + `/Ob3`
+- [indexed_dary_heap_results.md](indexed_dary_heap_results.md) — Phase 4.1–4.3e detailed results
+- [csr_edge_value_perf_plan.md](csr_edge_value_perf_plan.md) — follow-on CSR-access investigation
+
+---
+
+## 1. What was added
+
+A new `Heap` template parameter on `dijkstra_shortest_paths` and
+`dijkstra_shortest_distances`, selecting one of two heap implementations
+via tag dispatch:
+
+| Tag | Implementation |
+|-----|----------------|
+| `use_default_heap` (default) | `std::priority_queue` with lazy deletion. Heap may grow to O(E); stale entries skipped at pop. |
+| `use_indexed_dary_heap<Arity = 4>` | Indexed d-ary heap with true decrease-key. Heap size bounded by O(V); no stale pops. Position map auto-selected: `vector_position_map` for `index_vertex_range<G>`, `assoc_position_map` (unordered_map) otherwise. |
+
+Both branches preserve identical visitor semantics
+(`on_examine_vertex` / `on_finish_vertex` fire exactly once per reachable
+vertex; `on_edge_relaxed` / `on_edge_not_relaxed` fire exactly once per
+outgoing edge of every examined vertex).
+
+---
+
+## 2. Test matrix
+
+Same machine (Titania, 20×3.61 GHz; 48 KiB L1-D, 1.28 MiB L2, 25 MiB L3)
+under both toolchains.
+
+| Axis | Values |
+|------|--------|
+| OS / toolchain | Linux WSL2 + GCC (Phase 0/4.1), Windows + MSVC 19.50 (Phase 4.3b–e) |
+| Container | `compressed_graph` (CSR), `dynamic_graph` (VoV) |
+| Topology | Erdős–Rényi sparse (E/V ≈ 8), Barabási–Albert m=4 (E/V ≈ 8), 2D grid (E/V ≈ 4), path (E/V = 1) |
+| Size | 1 K, 10 K, 100 K vertices |
+| Heap | `Default`, `Idx2`, `Idx4`, `Idx8` |
+| Reference | Boost.Graph `dijkstra_shortest_paths_no_color_map_no_init` on `compressed_sparse_row_graph` and on `adjacency_list` |
+
+Distance-vector parity vs BGL is asserted at startup
+(`check_bgl_distance_parity` in `bgl_dijkstra_fixtures.hpp`) for ER, BA, and
+Path at n = 1024.
+
+---
+
+## 3. Headline performance results (CSR, n = 100 000)
+
+### Linux / GCC (Phase 4.1, mean of 3 runs)
+
+| Topology | E/V | Default | Idx2 | Idx4 | Idx8 | Best vs Default |
+|----------|----:|--------:|-----:|-----:|-----:|:---------------:|
+| ER Sparse | 8 | 27.0 ms | 24.2 ms | 25.8 ms | **20.2 ms** | **−25 %** |
+| BA m=4    | 8 | 22.9 ms | 22.1 ms | 20.0 ms | **19.0 ms** | **−17 %** |
+| Grid      | 4 | **6.0 ms** | 6.7 ms | 8.2 ms | 8.4 ms | indexed +39 % regression |
+| Path      | 1 | **0.27 ms** | 0.33 ms | 0.33 ms | 0.33 ms | indexed +22 % regression |
+
+### Windows / MSVC (`/O2 /Ob2`, median of 5 reps)
+
+| Topology | E/V | Default | Idx2 | Idx4 | Idx8 | Best vs Default |
+|----------|----:|--------:|-----:|-----:|-----:|:---------------:|
+| ER Sparse | 8 | 26.7 ms | 26.4 ms | **21.1 ms** | 22.2 ms | **−21 %** |
+| BA m=4    | 8 | 25.3 ms | 25.4 ms | **19.6 ms** | 22.2 ms | **−22 %** |
+| Grid      | 4 | **6.2 ms** | 6.9 ms | 6.9 ms | 8.2 ms | indexed +11 % regression |
+| Path      | 1 | 1.33 ms | 0.49 ms | 0.50 ms | **0.49 ms** | **−63 %** ✅ |
+
+### vs Boost.Graph on the same graphs (Linux/GCC, n = 100 K)
+
+| Topology | graph-v3 Default | graph-v3 Idx8 | BGL CSR | BGL `adjacency_list` |
+|----------|-----------------:|---------------:|--------:|---------------------:|
+| ER Sparse | 26.2 ms | **22.9 ms** | **19.9 ms** | 34.2 ms |
+| BA m=4    | 26.9 ms | **21.7 ms** | **19.6 ms** | 30.9 ms |
+| Grid      | **6.2 ms** | 8.9 ms | **6.1 ms** | 9.9 ms |
+| Path      | **0.27 ms** | 0.33 ms | 0.28 ms | 0.52 ms |
+
+graph-v3 with the default heap **beats `boost::adjacency_list` on every
+topology** (23–48 % faster). Against `boost::compressed_sparse_row_graph`
+(BGL's optimised CSR with a native 4-ary indexed heap), `Idx8` closes most
+of the gap on dense workloads (within ~5–10 %); the residual gap is in
+graph-v3's edge-value access path on CSR, not in the heap (Phase 4.3a/b
+verified — see [csr_edge_value_perf_plan.md](csr_edge_value_perf_plan.md)).
+
+---
+
+## 4. Cross-topology summary
+
+| Topology | E/V | Best heap (GCC) | Best heap (MSVC) | Notes |
+|----------|----:|:---------------:|:----------------:|-------|
+| ER Sparse | 8 | **Idx8** | **Idx4** (Idx8 close) | Decrease-key wins on dense random graphs. |
+| BA m=4    | 8 | **Idx8** | **Idx4** (Idx8 close) | Hub vertices drive heavy decrease-key traffic. |
+| Grid      | 4 | **Default** | **Default** | Position-map bookkeeping outweighs decrease-key benefit at low E/V. |
+| Path      | 1 | **Default** (slightly) | **Indexed (any)** — 2.7× faster | Under MSVC `std::priority_queue` codegen is materially slower than libstdc++ on this no-decrease-key workload. |
+
+Key takeaways:
+
+- **No single heap wins everywhere.** The trade-off is fundamentally
+  topology-dependent: indexed-heap bookkeeping is overhead on low-E/V
+  graphs, but pays off whenever decrease-key activity is meaningful.
+- **Arity ordering on dense graphs:** Idx8 ≥ Idx4 > Idx2 on GCC; under
+  MSVC Idx4 was a touch better than Idx8 at n = 100 K, but the two are
+  within run-to-run noise on dense workloads.
+- **`Arity = 4` matches Boost's `d_ary_heap_indirect`**, which is BGL's
+  hard-coded internal default. Choosing `Arity = 4` as graph-v3's default
+  preserves like-for-like comparability with BGL.
+- **VoV gap is smaller** than CSR (Idx4 only −3 % on ER/BA): VoV's extra
+  indirection dilutes the heap's relative contribution.
+
+---
+
+## 5. Decision: defaults
+
+| Aspect | Choice | Rationale |
+|--------|--------|-----------|
+| Public default heap | **`use_default_heap`** | Lowest overhead on low-E/V workloads (Grid, Path) under GCC. Wins or ties on 3 of 4 topologies on Linux. The single MSVC Path regression is fully recoverable by users opting into `use_indexed_dary_heap` for that workload. |
+| Default arity for `use_indexed_dary_heap` | **`Arity = 4`** | Matches BGL's hard-coded arity, simplifying apples-to-apples comparison; on x86_64 Idx4 and Idx8 are within 1–5 pp on dense workloads — Idx8 wins narrowly under GCC, Idx4 wins narrowly under MSVC. `Arity = 4` is the safer default; users tuning for high E/V on x86_64 should explicitly choose `use_indexed_dary_heap<8>`. |
+
+Documented user guidance (mirrored in the algorithm header):
+
+> Use `use_indexed_dary_heap<8>` for dense (E/V ≳ 8) random or scale-free
+> graphs on CSR. Keep `use_default_heap` for grid-like, path-like, or
+> generally low-E/V workloads. The MSVC Path workload is an additional
+> case where opting into the indexed heap is a clear win.
+
+A heuristic auto-selector based on E/V was considered and rejected:
+computing E/V at call time adds overhead, and a compile-time E/V is not
+available for the general `adjacency_list` concept.
+
+---
+
+## 6. Toolchain-specific finding (MSVC)
+
+VTune software-mode hotspots on `Grid_Idx4/100K` (Phase 4.3b–e) showed that
+MSVC `/O2 /Ob2` does **not** inline the indexed-heap internals — `sift_down_`
+appeared as a real call frame consuming ~31 % of CPU time, and
+`std::less<double>::operator()` appeared as multiple distinct callable
+symbols (~17 % combined). The same code under GCC `-O2/-O3` is fully
+collapsed into a single inlined run-lambda.
+
+`__forceinline` on `sift_down_` / `sift_up_` / `less_than_` / `place_`
+alone had no measurable effect at `/Ob2`. The combination
+**`/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_`** does close the
+inlining gap (98.8 % of CPU now in a single inlined frame), but the
+wall-clock impact is mixed:
+
+| Topology (100 K) | `/Ob3` Δ vs `/Ob2` |
+|------------------|-------------------:|
+| ER Sparse Idx4 | −2.6 % |
+| Grid Idx4 | +8.2 % regression |
+| BA Idx4 | +6.3 % regression |
+| Path Idx4 | **−7.6 %** |
+| Path Default | +5.3 % |
+
+The `/Ob3` regressions on Grid and BA come from icache pressure: the
+inlined `sift_down_` body expands the run-lambda enough to hurt
+code-layout-sensitive workloads.
+
+**Outcome:** `/Ob3` is **not** committed as the default MSVC release
+flag. The build presets remain at `/O2 /Ob2 /DNDEBUG`. The
+`GRAPH_DETAIL_FORCE_INLINE` macro stays in `indexed_dary_heap.hpp`
+(harmless under GCC where it is `[[gnu::always_inline]]`); the
+`sift_down_` annotation was reverted because it provides no GCC benefit
+and only matters under `/Ob3` on MSVC, which we do not enable.
+
+Users who care specifically about the MSVC Path case can build with
+`/Ob3` themselves; the public defaults optimise for the common case.
+
+---
+
+## 7. Open follow-ups (out of scope for the heap parameter)
+
+These are tracked in [csr_edge_value_perf_plan.md](csr_edge_value_perf_plan.md):
+
+- The remaining ~5–10 % gap to BGL CSR on dense workloads is in graph-v3's
+  edge-value access path (`edge_value(g, uv)` on `compressed_graph`), not
+  in the heap. Phase 4.3b confirmed the gap is work-bound (does not grow
+  with `n` across L2→L3 transition), ruling out memory-bound suspects.
+- HW-counter profiling (`perf stat -e cycles,instructions,…`) is blocked
+  on WSL2 PMC support and is deferred. VTune `uarch-exploration` on
+  Windows is similarly deferred pending SEP driver / admin elevation.
+
+No further heap changes are planned. The `Heap` template parameter
+shipped, with `use_default_heap` as the default and
+`use_indexed_dary_heap<4>` as the documented opt-in for dense workloads.
diff --git a/agents/dary_heap/indexed_dary_heap_baseline.md b/agents/dary_heap/indexed_dary_heap_baseline.md
new file mode 100644
index 0000000..e2ebbf4
--- /dev/null
+++ b/agents/dary_heap/indexed_dary_heap_baseline.md
@@ -0,0 +1,65 @@
+# Dijkstra Baseline Benchmarks (Phase 0.4)
+
+Captured: 2026-04-25  
+Branch: `indexed-dary-heap` (heap implementation: `std::priority_queue`, lazy-deletion)  
+Binary: `build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra`  
+Flags: `--benchmark_min_time=1s`
+
+## Machine
+
+| Property | Value |
+|----------|-------|
+| Host | Titania |
+| CPUs | 20 × 3609.6 MHz |
+| L1-D | 48 KiB × 10 |
+| L2 | 1280 KiB × 10 |
+| L3 | 25600 KiB × 1 |
+| OS | Linux |
+
+## Results
+
+All times are wall-clock nanoseconds per Dijkstra call.  
+Construction and distance-vector reset are excluded from the timed region.
+
+### CSR (`compressed_graph`) — primary container
+
+| Benchmark | 1K ns | 10K ns | 100K ns | Complexity |
+|-----------|------:|-------:|--------:|------------|
+| ER Sparse (E/V≈8) | 61 016 | 1 362 706 | 29 086 330 | O(N log N) |
+| Grid 2D (E/V≈4) | 24 991 | 525 412 | 6 706 910 | O(N log N) |
+| Barabási–Albert m=4 (E/V≈8) | 58 157 | 1 338 566 | 25 402 054 | O(N log N) |
+| Path (E/V=1) | 2 991 | 28 017 | 275 799 | O(N) |
+
+### VoV (`dynamic_graph`) — secondary container
+
+| Benchmark | 1K ns | 10K ns | 100K ns | Complexity |
+|-----------|------:|-------:|--------:|------------|
+| ER Sparse (E/V≈8) | 56 416 | 1 396 716 | 32 867 125 | O(N²) † |
+| Grid 2D (E/V≈4) | 25 512 | 511 499 | 8 587 763 | O(N log N) |
+| Barabási–Albert m=4 (E/V≈8) | 52 471 | 1 402 162 | 32 211 598 | O(N²) † |
+| Path (E/V=1) | 4 599 | 43 635 | 440 975 | O(N) |
+
+† Google Benchmark fit `O(N²)` for VoV ER/BA at the measured scale; the
+  true complexity is O((V + E) log V) — this is likely a fitting artefact
+  from only three data points at high constant factors.  Treat as O(N log N)
+  for interpretation purposes.
+
+## Key Observations
+
+| Observation | Detail |
+|-------------|--------|
+| CSR ≈ VoV at small scale (1K–10K) | Traversal cost is not yet dominant |
+| CSR outperforms VoV at 100K | 29 ms vs 33 ms (ER), 6.7 ms vs 8.6 ms (Grid) |
+| BA ≈ ER on CSR | Both E/V≈8, similar times as expected |
+| Path is dramatically cheaper | 276 µs vs 29 ms at 100K — confirms lazy-deletion overhead with large heaps |
+| CSR Path ≈ 2.76N | Sub-logarithmic: path graph → at most n pushes, heap stays tiny |
+| VoV Path ≈ 4.41N | Consistent ~60% overhead vs CSR across all scales |
+
+## What to Beat in Phase 4
+
+After the indexed d-ary heap is integrated, every CSR row should improve.
+The path graph is the hardest to beat (heap barely fills) and the ER/BA
+graphs are the easiest to win on (O(E) heap pops with lazy-deletion vs
+O(V) with decrease-key).
+
+Target: CSR ER Sparse 100K ≤ **22 ms** (−25% vs 29 ms baseline).
diff --git a/agents/dary_heap/indexed_dary_heap_baseline_msvc.md b/agents/dary_heap/indexed_dary_heap_baseline_msvc.md
new file mode 100644
index 0000000..e788308
--- /dev/null
+++ b/agents/dary_heap/indexed_dary_heap_baseline_msvc.md
@@ -0,0 +1,232 @@
+# Dijkstra MSVC Baseline Benchmarks
+
+Captured: 2026-04-26
+Branch: `indexed-dary-heap` (HEAD `281fc7a`, working tree clean)
+Binary: `build/windows-msvc-release/benchmark/algorithms/benchmark_dijkstra.exe`
+Toolchain: MSVC 19.50.35729 (Visual Studio 18.5.1, host x64, target x64)
+Build flags: default `windows-msvc-release` preset (`/O2 /Ob2 /DNDEBUG`)
+Benchmark flags: `--benchmark_min_time=1s --benchmark_repetitions=5
+--benchmark_report_aggregates_only=true`
+Process pinning: single physical core (affinity mask `0x1`), priority class
+`High`. Run was split into four per-topology batches to stay inside the
+session's command-tool timeout; each batch is independent.
+
+## Machine
+
+| Property | Value |
+|----------|-------|
+| Host | Titania (same as Linux baseline / Phase 4.x results) |
+| CPUs | 20 × 3610 MHz |
+| L1-D | 48 KiB × 10 |
+| L1-I | 32 KiB × 10 |
+| L2 | 1280 KiB × 10 |
+| L3 | 25600 KiB × 1 |
+| OS | Windows |
+
+The hardware exactly matches the Linux Phase 0 baseline and Phase 4.x
+comparative runs, so any differences vs. those numbers reflect the
+toolchain (MSVC vs. GCC) and the C++ standard library (MSVC STL vs.
+libstdc++), not the machine.
+
+## Methodology notes (vs. the Linux baseline)
+
+| Concern | Linux baseline | MSVC baseline (this file) |
+|---------|----------------|---------------------------|
+| Frequency scaling | `cpupower frequency-set -g performance` | Process priority `High`; no governor knob on Windows. CV reported per row. |
+| Single-core pin | `taskset -c 4` | `Process.ProcessorAffinity = 0x1` |
+| min_time | `1s` (Phase 0 baseline) / `2s` (Phase 4.3a) | `1s` |
+| Repetitions | 1 (Phase 0) / 3 (Phase 4.x averages) | 5 (median + CV reported) |
+| Aggregation | mean of repetitions | median of 5 repetitions |
+
+## Heap variants compared
+
+| Tag | Description |
+|-----|-------------|
+| **Default** | `use_default_heap` — `std::priority_queue`, lazy deletion |
+| **Idx2** | `use_indexed_dary_heap<2>` — binary heap, true decrease-key |
+| **Idx4** | `use_indexed_dary_heap<4>` — 4-ary heap, true decrease-key |
+| **Idx8** | `use_indexed_dary_heap<8>` — 8-ary heap, true decrease-key |
+
+---
+
+## Results — CSR (`compressed_graph`)
+
+All times are wall-clock (real_time) nanoseconds per Dijkstra call,
+**median of 5 repetitions**. CV is the coefficient of variation (real_time)
+over those 5 repetitions; rows where CV exceeds 5 % are flagged with `†`.
+
+### Erdős–Rényi sparse, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | CV at 100K |
+|------|------:|-------:|--------:|-----------:|
+| Default | 69,406 | 1,288,481 | 26,655,249 | 1.58 % |
+| Idx2 | 47,768 | 1,171,749 | 26,422,283 | 4.50 % |
+| Idx4 | 54,294 | 1,063,524 | 21,124,883 | 4.63 % |
+| Idx8 | 70,041 | 1,290,562 | 22,223,590 | 0.49 % |
+
+### 2D Grid, E/V ≈ 4
+
+| Heap | 1K ns | 10K ns | 100K ns | CV at 100K |
+|------|------:|-------:|--------:|-----------:|
+| Default | 24,939 | 510,742 | 6,190,532 | 1.16 % |
+| Idx2 | 21,970 | 536,500 | 6,927,086 | 1.15 % |
+| Idx4 | 27,455 | 544,960 | 6,873,101 | 0.52 % |
+| Idx8 | 32,708 | 673,041 | 8,246,247 | 1.26 % |
+
+### Barabási–Albert, m=4, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | CV at 100K |
+|------|------:|-------:|--------:|-----------:|
+| Default | 70,715 | 1,279,933 | 25,268,386 | 0.82 % |
+| Idx2 | 43,928 | 1,133,997 | 25,353,973 | 1.65 % |
+| Idx4 | 50,625 | 1,024,064 | 19,603,770 | 0.80 % |
+| Idx8 | 67,539 | 1,244,579 | 22,194,943 | 8.92 % † |
+
+### Path, E/V = 1
+
+| Heap | 1K ns | 10K ns | 100K ns | CV at 100K |
+|------|------:|-------:|--------:|-----------:|
+| Default | 13,241 | 132,771 | 1,331,743 | 0.27 % |
+| Idx2 | 4,568 | 43,392 | 485,695 | 1.80 % |
+| Idx4 | 4,510 | 43,019 | 498,438 | 1.77 % |
+| Idx8 | 4,446 | 42,962 | 491,302 | 0.66 % |
+
+`†` BA Idx8 at 100K had a single high-variance run (one outlier of the
+five repetitions). The mean and median are within 4 % of each other so the
+median number is robust; treat the absolute level as ±10 % until rerun.
+
+---
+
+## MSVC vs. Linux baseline — same machine, same hardware
+
+CSR 100K, median (MSVC) vs. Phase 0 baseline / Phase 4.1 results (Linux GCC),
+both wall-clock ns per call:
+
+| Topology | Heap | MSVC ns | GCC ns (ref) | MSVC ÷ GCC | Source for GCC |
+|----------|------|--------:|-------------:|-----------:|----------------|
+| ER Sparse | Default | 26,655,249 | 27,049,885 | 0.99× | results §4.1 ER |
+| ER Sparse | Idx4 | 21,124,883 | 25,756,981 | 0.82× | results §4.1 ER |
+| ER Sparse | Idx8 | 22,223,590 | 20,216,860 | 1.10× | results §4.1 ER |
+| Grid | Default | 6,190,532 | 6,026,301 | 1.03× | results §4.1 Grid |
+| Grid | Idx4 | 6,873,101 | 8,165,088 | 0.84× | results §4.1 Grid |
+| Grid | Idx8 | 8,246,247 | 8,400,126 | 0.98× | results §4.1 Grid |
+| BA | Default | 25,268,386 | 22,904,717 | 1.10× | results §4.1 BA |
+| BA | Idx4 | 19,603,770 | 19,998,964 | 0.98× | results §4.1 BA |
+| BA | Idx8 | 22,194,943 | 19,038,871 | 1.17× † | results §4.1 BA |
+| Path | Default | 1,331,743 | 268,708 | 4.96× ‼ | results §4.1 Path |
+| Path | Idx4 | 498,438 | 326,018 | 1.53× | results §4.1 Path |
+| Path | Idx8 | 491,302 | 327,820 | 1.50× | results §4.1 Path |
+
+`†` BA Idx8 100K MSVC has CV 8.9 %; the 1.17× ratio may shift on rerun.
+`‼` Path/Default shows the **largest divergence** between toolchains. With
+the indexed heap the ratio drops to ~1.5×, suggesting MSVC's
+`std::priority_queue` codegen is materially slower than libstdc++'s on this
+no-decrease-key workload — the new heap path is much closer to GCC parity.
+
+### Cross-topology relative ordering — does the Phase 4.x story hold under MSVC?
+
+| Topology | Best heap (Linux GCC, Phase 4.1) | Best heap (MSVC, this run) | Same? |
+|----------|----------------------------------|----------------------------|-------|
+| ER Sparse | Idx8 (−25 %) | **Idx4** (−21 % vs Default), Idx8 close (−17 %) | Idx4 / Idx8 swap; both clearly beat Default |
+| Grid | Default | **Default** (Idx2/Idx4 within +11 %; Idx8 +33 %) | ✅ |
+| BA | Idx8 (−17 %) | **Idx4** (−22 % vs Default), Idx8 noisy | Mostly ✅ — both indexed variants win |
+| Path | Default | **Default** for absolute time, indexed wins by 2.7× — see below | ⚠ swap |
+
+The Path case is now the **opposite** of the Linux story: under MSVC the
+indexed heap is **2.7× faster** than the default at 100K (491k ns vs
+1.33M ns), whereas under GCC the default was 22 % faster than the indexed
+heap. This is the headline MSVC-specific finding.
+
+## Recommendation update
+
+The Phase 4.2 default decision (`use_default_heap` is the public default)
+holds under MSVC:
+
+- It still wins or ties on Grid.
+- It loses badly on Path under MSVC (where it lost slightly to indexed under
+  GCC), but Path is a degenerate case (no decrease-key opportunity); the
+  indexed-heap recommendation already covers it.
+- On dense / scale-free workloads (ER, BA), Idx4 is now slightly better than
+  Idx8 under MSVC at n = 100 K — opposite of GCC. The Phase 4.2
+  recommendation of `use_indexed_dary_heap<8>` should be **softened to
+  "Idx4 or Idx8"** for the MSVC documentation, with a note that the
+  toolchain affects the optimum.
+
+These observations are **not strong enough to change any defaults or
+recommendations** — they are baseline numbers for the next phase
+(Phase 4.3b on Windows: VTune Microarchitecture Exploration of the relax
+loop). Their purpose is to anchor every later VTune number to a known-good
+point so we can tell "this VTune sample reflects a representative run".
+
+---
+
+## `/Ob3` results — Phase 4.3e (2026-04-27)
+
+Build: `windows-msvc-release` with `CMAKE_CXX_FLAGS_RELEASE=/O2 /Ob3 /DNDEBUG`  
+`indexed_dary_heap.hpp`: `sift_down_` annotated `GRAPH_DETAIL_FORCE_INLINE`  
+Same methodology: 5 reps, median, core 0, priority High.
+
+### ER Sparse, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K |
+|------|------:|-------:|--------:|----------------:|
+| Default | 77,657 | 1,336,662 | 26,533,738 | ≈0 % |
+| Idx2 | 54,785 | 1,171,990 | 26,154,649 | −1.0 % |
+| Idx4 | 50,190 | 1,020,621 | 20,572,087 | **−2.6 %** |
+| Idx8 | 80,134 | 1,248,033 | 23,112,681 | +4.0 % |
+
+### 2D Grid, E/V ≈ 4
+
+| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K |
+|------|------:|-------:|--------:|----------------:|
+| Default | 25,203 | 537,223 | 6,323,796 | +2.2 % |
+| Idx2 | 24,694 | 579,034 | 7,490,114 | +8.1 % |
+| Idx4 | 28,244 | 606,708 | 7,440,434 | +8.2 % |
+| Idx8 | 35,770 | 723,495 | 8,859,656 | +7.4 % |
+
+### Barabási–Albert, m=4, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K |
+|------|------:|-------:|--------:|----------------:|
+| Default | 91,214 | 1,422,637 | 27,633,036 | +9.3 % |
+| Idx2 | 60,420 | 1,209,414 | 26,769,593 | +5.6 % |
+| Idx4 | 54,289 | 1,068,178 | 20,839,074 | +6.3 % |
+| Idx8 | 87,263 | 1,348,705 | 23,320,973 | +5.1 % |
+
+### Path, E/V = 1
+
+| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K |
+|------|------:|-------:|--------:|----------------:|
+| Default | 14,059 | 138,226 | 1,401,957 | +5.3 % |
+| Idx2 | 4,555 | 44,408 | 463,958 | **−4.5 %** |
+| Idx4 | 4,829 | 44,297 | 460,474 | **−7.6 %** |
+| Idx8 | 4,700 | 44,029 | 461,246 | **−6.1 %** |
+
+### Summary: /Ob3 vs /Ob2 at 100K
+
+| Topology | Heap | /Ob2 ns | /Ob3 ns | Δ |
+|----------|------|--------:|--------:|---|
+| ER Sparse | Default | 26,655,249 | 26,533,738 | ≈0 % |
+| ER Sparse | Idx4 | 21,124,883 | 20,572,087 | −2.6 % |
+| Grid | Default | 6,190,532 | 6,323,796 | +2.2 % |
+| Grid | Idx4 | 6,873,101 | 7,440,434 | +8.2 % ⚠ |
+| BA | Default | 25,268,386 | 27,633,036 | +9.3 % ⚠ |
+| BA | Idx4 | 19,603,770 | 20,839,074 | +6.3 % ⚠ |
+| Path | Default | 1,331,743 | 1,401,957 | +5.3 % |
+| Path | Idx4 | 498,438 | 460,474 | **−7.6 %** ✅ |
+
+### Interpretation
+
+- **Path indexed heap wins** (−4.5 % to −7.6 %): this is the workload where
+  the VTune profile showed the most comparator-chain overhead — `/Ob3`
+  collapses it and delivers a measurable wall-clock improvement.
+- **Grid and BA show regressions (+6–9 %)**: the inlined `sift_down_` body
+  expands the run-lambda significantly on these topologies (larger working
+  set, more icache pressure, different branch predictor behaviour). The
+  `/Ob2` code-layout was better for these cases.
+- **ER Sparse is essentially neutral** (within noise).
+- **Net verdict**: `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` is
+  **not a universal win**. It helps Path (the inlining-bottlenecked case)
+  but regresses Grid/BA (icache-sensitive cases). Reverting `sift_down_`
+  annotation and keeping `/Ob3` only for the flag-level benefit (without
+  force-inline on the loop body) is the next thing to try.
diff --git a/agents/dary_heap/indexed_dary_heap_plan.md b/agents/dary_heap/indexed_dary_heap_plan.md
new file mode 100644
index 0000000..094980e
--- /dev/null
+++ b/agents/dary_heap/indexed_dary_heap_plan.md
@@ -0,0 +1,601 @@
+# Indexed d-ary Heap for Dijkstra & Prim — Plan
+
+This plan introduces a true decrease-key priority queue to replace the
+`std::priority_queue` lazy-deletion pattern currently used by Dijkstra
+(and likely useful for Prim's MST). The goal is to remove stale-pop
+overhead, reduce heap memory from O(E) to O(V), and bring visitor
+semantics in line with BGL.
+
+**Branch:** `indexed-dary-heap`
+
+**Invariant:** After every phase, `ctest` passes all existing tests. No
+phase may break the public API of `dijkstra_shortest_paths`,
+`dijkstra_shortest_distances`, or any algorithm that already uses
+`std::priority_queue` internally.
+
+---
+
+## Conventions
+
+| Symbol | Meaning |
+|--------|---------|
+| **File** | Absolute path relative to repo root |
+| **Read** | Files the agent must read for context before editing |
+| **Create** | New files to create |
+| **Modify** | Existing files to edit |
+| **Verify** | Commands to run and expected outcomes |
+| **Commit** | Git commit message (conventional-commit style) |
+
+---
+
+## Background
+
+### Current state
+
+`dijkstra_shortest_paths` uses `std::priority_queue<weighted_vertex>`
+with re-insertion when a vertex's distance improves. The recently
+added stale-pop skip:
+
+```cpp
+if (compare(distance(g, uid), w)) continue;
+```
+
+makes this correct and gives single-shot visitor semantics, but the
+heap can hold up to O(E) entries and every relaxed edge causes a
+push.
+
+### Target state
+
+A min-heap that:
+
+- Stores at most one entry per vertex (size ≤ V).
+- Supports `push`, `top`, `pop`, `decrease(vid)`, `contains(vid)`.
+- Looks up a vertex's current distance via the user-supplied
+  `DistanceFn` (so heap order tracks live distance).
+- Is parameterized on arity `d` (default `d = 4`, matching Boost's
+  `d_ary_heap_indirect`).
+- Uses an external position map (`vertex_id -> heap_index`) so that
+  `decrease` is O(log_d V).
+
+### Performance hypothesis
+
+| Workload | Expected change vs. current |
+|----------|-----------------------------|
+| Sparse graph, few re-relaxations | Small win (push count drops, log V vs log E) |
+| Dense graph, many re-relaxations | Large win (heap size O(V) vs O(E)) |
+| Mapped (associative) vertex containers | Win depends on position-map cost |
+
+Hypothesis must be confirmed by benchmarks (Phase 4) before declaring
+the new heap the default.
+
+---
+
+## Phase 0 — Preparation (no code changes)
+
+### 0.1 Verify Baseline
+
+| Item | Detail |
+|------|--------|
+| **Action** | Confirm the full test suite is green on the branch base. |
+| **Verify** | `cd build/linux-gcc-debug && ctest --output-on-failure` — all tests pass |
+
+### 0.2 Benchmark Fixtures
+
+The fixtures must isolate three orthogonal axes: **scale**, **topology**,
+and **weight distribution**. Decrease-key matters most where many edges
+trigger relaxation, so topology selection is more important than raw size.
+
+#### Synthetic generators (primary — cheap, reproducible, deterministic with seed)
+
+| Generator | Purpose | Why it matters for d-ary heap |
+|-----------|---------|-------------------------------|
+| **Erdős–Rényi G(n, p)** | Random sparse/dense baseline | Tunable E/V ratio; "average case" |
+| **2D grid** | Spatial structure | Many short paths converge → moderate re-relaxation |
+| **Barabási–Albert (power-law)** | Hub-heavy | High-degree hubs cause heavy decrease-key traffic — where indexed heap should win biggest |
+| **Watts–Strogatz small-world** | Realistic mixed | Local + long-range edges; intermediate case |
+| **Path / cycle / complete** | Edge cases | Sanity bounds (no decreases vs. all decreases) |
+
+**Scale sweep:** V ∈ {10³, 10⁴, 10⁵, 10⁶} (and 10⁷ where memory permits).
+**Density sweep:** E/V ∈ {2, 8, 32} for Erdős–Rényi.
+**Weight distributions:** uniform random (default), exponential (drives more
+decrease-key calls due to heavy left tail), constant 1 (BFS-equivalent floor).
+
+#### Graph container types
+
+Use **`compressed_graph`** as the primary benchmark container. Its CSR
+layout (contiguous edge storage, no per-vertex indirection) minimizes
+graph-traversal overhead so that heap operation cost is a larger fraction
+of total runtime — making differences between heap implementations easier
+to measure. Dijkstra never modifies the graph, so the read-only restriction
+is not a concern.
+
+Also include **`dynamic_graph` (vov-backed)** in a secondary sweep to
+confirm that wins on `compressed_graph` hold under a realistic dynamic
+container and to serve as a regression baseline for typical user code.
+
+Do not benchmark mapped containers here — that is covered in Phase 3.
+
+#### Real-world graphs (validation — pick 2 to confirm synthetic conclusions)
+
+| Source | Suggested graph | Why |
+|--------|-----------------|-----|
+| **SNAP** | `roadNet-CA` (1.9M V, 5.5M E) | Classic Dijkstra benchmark; spatial / planar |
+| **SNAP** | `web-Google` (875K V, 5M E) | Web-link topology; mixed degree distribution |
+
+DIMACS USA road network (24M V) and Graph500 RMAT are deferred — only
+worth the infrastructure if Phase 4 results are ambiguous.
+
+#### Benchmark protocol
+
+| Concern | Approach |
+|---------|----------|
+| Generation cost | Build the graph in `SetUp()` / `state.PauseTiming()`, run Dijkstra in the timed region |
+| Source variance | Run from N random sources per graph (e.g., N=8) and average |
+| Metric beyond time | Report **relaxation count** alongside time — distinguishes heap-implementation wins from visitor-semantics bugs |
+| CPU stability | Disable frequency scaling (`cpupower frequency-set -g performance`) to meet CV < 5% target |
+| Caching of fixtures | Real-world graphs cached under `benchmark/data/` (gitignored); document fetch URLs in a README |
+
+| Item | Detail |
+|------|--------|
+| **Create** | `benchmark/algorithms/dijkstra_fixtures.hpp` (generators + loaders) |
+| **Create** | `benchmark/data/README.md` (fetch instructions for SNAP graphs) |
+| **Verify** | Each fixture produces a deterministic graph for a given seed; relaxation counts are stable across runs. |
+
+### 0.3 Add Dijkstra Benchmark (if missing)
+
+| Item | Detail |
+|------|--------|
+| **Action** | Ensure a Google Benchmark target exercises Dijkstra over the fixtures defined in 0.2. |
+| **Create** | `benchmark/algorithms/benchmark_dijkstra.cpp` if not present |
+| **Verify** | Benchmark builds and produces stable numbers across runs (CV < 5%). |
+
+### 0.4 Capture Baseline Benchmarks
+
+| Item | Detail |
+|------|--------|
+| **Action** | Record current Dijkstra benchmark numbers before any heap changes. |
+| **Read** | `benchmark/algorithms/benchmark_dijkstra.cpp` |
+| **Verify** | Save numbers to `agents/indexed_dary_heap_baseline.md` (committed as reference). |
+
+---
+
+## Phase 1 — Indexed d-ary Heap Container
+
+### 1.1 Design Header
+
+| Item | Detail |
+|------|--------|
+| **Read** | `boost/libs/graph/include/boost/graph/detail/d_ary_heap.hpp` for reference |
+| **Create** | `include/graph/detail/indexed_dary_heap.hpp` |
+
+Sketch of the public interface:
+
+```cpp
+namespace graph::detail {
+
+// External-key, indirect-comparison d-ary heap.
+//
+// Key      : the user's vertex id type (must be usable as an index/lookup key)
+// DistanceFn: callable (key) -> Distance&  (or const Distance&)
+// Compare  : strict weak order over Distance values (min-heap if less<>)
+// PositionMap: random-access mapping key -> size_t (heap position) or NPOS
+// Arity    : children per node (default 4)
+template <
+    class Key,
+    class DistanceFn,
+    class Compare,
+    class PositionMap,
+    std::size_t Arity = 4,
+    class Allocator = std::allocator<Key>>
+class indexed_dary_heap {
+public:
+    static constexpr std::size_t npos = static_cast<std::size_t>(-1);
+
+    indexed_dary_heap(DistanceFn d, Compare c, PositionMap p, const Allocator& = {});
+
+    bool   empty() const noexcept;
+    size_t size()  const noexcept;
+
+    void   push(Key k);              // O(log_d N)
+    Key    top()   const;            // O(1)
+    void   pop();                    // O(d log_d N)
+    void   decrease(Key k);          // O(log_d N) — distance must already be lower
+    bool   contains(Key k) const;    // O(1)
+    void   clear();
+
+private:
+    std::vector<Key, Allocator> heap_;
+    DistanceFn   distance_;
+    Compare      compare_;
+    PositionMap  position_;          // heap stores positions back into here on every move
+
+    void sift_up_(size_t i);
+    void sift_down_(size_t i);
+    void place_(size_t i, Key k);    // writes heap_[i] = k AND position_[k] = i
+};
+
+} // namespace graph::detail
+```
+
+Notes:
+- `PositionMap` is a *concept-style* requirement: `size_t& operator()(Key)` or
+  similar. For index-based graphs, it can wrap a `std::vector<size_t>`. For
+  mapped graphs, it can wrap an `std::unordered_map`. Decision deferred to
+  1.3.
+- `DistanceFn` is the *same* function the user passes to
+  `dijkstra_shortest_paths`. The heap reads, never writes.
+- Comparator is `Compare`, applied to *distances* (not keys). Internally:
+  `compare_(distance_(a), distance_(b))`.
+
+### 1.2 Implement Core Operations
+
+| Item | Detail |
+|------|--------|
+| **Action** | Implement `push`, `pop`, `sift_up_`, `sift_down_`, `decrease`, `contains`, `clear`. Keep `place_` as the single point where positions are written, to avoid bookkeeping bugs. |
+| **Verify** | Unit-tests in 1.4 pass. |
+
+Key correctness rules:
+- Every assignment to `heap_[i]` must go through `place_` so `position_` stays in sync.
+- `decrease(k)` reads `position_(k)` then sifts up only — caller guarantees the new distance is no worse.
+- `pop()` swaps last → root, marks the popped key's position as `npos`, then sifts down.
+
+### 1.3 Position Map Adapter
+
+| Item | Detail |
+|------|--------|
+| **Create** | `include/graph/detail/heap_position_map.hpp` |
+| **Action** | Provide two adapters: <br> 1. `vector_position_map` — wraps a `std::vector<size_t>` indexed by integral key. <br> 2. `assoc_position_map` — wraps `std::unordered_map<Key, size_t>` for non-integral keys. <br> Both default-construct to `npos` semantics. |
+| **Verify** | Adapters compile with the heap. Covered by tests in 1.4. |
+
+### 1.4 Unit Tests
+
+| Item | Detail |
+|------|--------|
+| **Create** | `tests/common/test_indexed_dary_heap.cpp` |
+| **Action** | Cover: empty heap, single element, ascending/descending pushes, mixed push+pop, repeated `decrease`, `contains` before/after push/pop, both arity 2 and 4, custom comparator (max-heap), both position-map adapters. |
+| **Verify** | `ctest -R indexed_dary_heap` — all pass. |
+| **Commit** | `feat(detail): indexed d-ary heap with external position map` |
+
+---
+
+## Phase 2 — Integrate into Dijkstra (opt-in)
+
+### 2.1 Add Heap-Selector Tag (or Template Parameter)
+
+| Item | Detail |
+|------|--------|
+| **Read** | `include/graph/algorithm/dijkstra_shortest_paths.hpp` |
+| **Modify** | Add an optional template parameter `Heap = use_default_heap` (a tag). When `use_default_heap`, behavior is unchanged. When `use_indexed_dary_heap<Arity>`, the new heap is used. |
+| **Verify** | Existing tests still pass (default path unchanged). |
+| **Commit** | `feat(dijkstra): add heap-selector template parameter` |
+
+Rationale: keeps the change additive and reversible. We can flip the
+default in a later phase once benchmarks confirm parity or improvement.
+
+### 2.2 Implementation Branch
+
+| Item | Detail |
+|------|--------|
+| **Modify** | Inside `dijkstra_shortest_paths`, dispatch to one of two inner implementations based on the `Heap` tag. Share the visitor / relax / source-seeding code via a small helper. |
+| **Action** | The indexed-heap implementation: <br> - Removes the stale-pop skip (no stale entries possible). <br> - Replaces re-push with `decrease` on the relax path. <br> - Removes `weighted_vertex` (heap stores ids only; distance is read live via `DistanceFn`). |
+| **Verify** | All existing Dijkstra tests pass under both code paths. Add a test variant that exercises each test with the indexed heap. |
+| **Commit** | `feat(dijkstra): indexed d-ary heap implementation path` |
+
+### 2.3 Visitor Semantics Audit
+
+| Item | Detail |
+|------|--------|
+| **Action** | Confirm `on_examine_vertex` and `on_finish_vertex` fire exactly once per reachable vertex on the indexed-heap path. Confirm `on_edge_relaxed` and `on_edge_not_relaxed` counts match Boost's behavior. |
+| **Verify** | Add a counting-visitor test that asserts call counts on a reference graph with both heap paths. |
+| **Commit** | `test(dijkstra): visitor call-count parity across heap paths` |
+
+---
+
+## Phase 3 — Mapped-Container Support
+
+### 3.1 Position Map for Mapped Graphs
+
+| Item | Detail |
+|------|--------|
+| **Read** | `agents/map_container_strategy.md`, `agents/map_container_plan.md` |
+| **Action** | Wire the `assoc_position_map` adapter into the indexed-heap dispatch when `vertex_id_t<G>` is non-integral or the graph is a mapped container. Decision criterion to be documented. |
+| **Verify** | Run the Dijkstra test suite against mapped graph types with the indexed heap. |
+| **Commit** | `feat(dijkstra): indexed-heap support for mapped containers` |
+
+### 3.2 Vertex-Property-Map Position Storage (optional)
+
+| Item | Detail |
+|------|--------|
+| **Action** | Investigate whether the position map can live inside the graph as a vertex property map (matching Boost's `vertex_property_map_generator`). Spike only — implement only if it removes a meaningful allocation on hot paths. |
+| **Verify** | Benchmark before/after on mapped graphs. |
+| **Commit** | `feat(dijkstra): in-graph position map for mapped containers` (only if accepted) |
+
+---
+
+## Phase 4 — Benchmarks & Default Selection
+
+### 4.1 Comparative Benchmarks ✅
+
+| Item | Detail |
+|------|--------|
+| **Action** | Run the Phase 0.3 benchmarks against (a) `priority_queue` path, (b) `indexed_dary_heap<2>`, (c) `indexed_dary_heap<4>`, (d) `indexed_dary_heap<8>`. Record results in `agents/indexed_dary_heap_results.md`. |
+| **Verify** | Numbers stable across at least 3 runs. |
+| **Status** | Completed in commit `fac4085`. Full numbers in `agents/indexed_dary_heap_results.md`. |
+
+### 4.2 Decide Default ✅
+
+| Item | Detail |
+|------|--------|
+| **Action** | Based on results: <br> - If indexed `d=4` wins or ties on every workload, make it the default. <br> - If it loses on sparse small graphs, keep `priority_queue` default and document the selector. <br> - If results are mixed, consider a heuristic dispatch (e.g., based on E/V ratio) — but only with strong evidence. |
+| **Modify** | Default heap parameter, plus a CHANGELOG entry. |
+| **Verify** | Full test suite still green. Benchmarks regenerated. |
+| **Commit** | `perf(dijkstra): switch default heap to indexed d-ary` (or document why not) |
+| **Decision** | **Keep `use_default_heap` as the default.** Phase 4.1 results are mixed: indexed wins by 17–25% on high-E/V workloads (ER, BA) but loses by 22–39% on grid (E/V≈4) and path (E/V=1). The grid regression is too large to justify a universal switch, and a heuristic E/V dispatch was considered but rejected as premature (one workload axis is not strong enough evidence; users with known graph shapes can opt in explicitly). Documented `use_indexed_dary_heap<8>` as the recommended opt-in for high-E/V random / scale-free workloads on `compressed_graph` in the heap-tag doc comments and CHANGELOG. |
+| **Status** | Completed. Default unchanged. CHANGELOG entry added; `use_default_heap` and `use_indexed_dary_heap` doc comments now record the recommendation. |
+
+### 4.3 BGL Comparison Benchmarks (optional validation) ✅
+
+Run after 4.2 as a "how do we compare to BGL" sanity check, not as a
+gating criterion for the default decision.
+
+#### Setup
+
+BGL is already available at `/home/phil/dev_graph/boost/`. Since it is
+header-only, add an `include_directories` entry in
+`benchmark/CMakeLists.txt` — no linking required.
+
+Create `benchmark/algorithms/bgl_dijkstra_fixtures.hpp` as a companion to
+`dijkstra_fixtures.hpp`. It builds BGL equivalents from the same edge-list
+generators so that both libraries operate on topologically identical graphs:
+
+| graph-v3 container | BGL equivalent |
+|--------------------|----------------|
+| `compressed_graph` | `compressed_sparse_row_graph<directedS, …, EdgeWeight>` |
+| `dynamic_graph` (vov) | `adjacency_list<vecS, vecS, directedS, …, EdgeWeight>` |
+
+#### Invocation wrapper
+
+BGL Dijkstra uses property maps; a thin (~20-line) wrapper per container
+type handles construction and invocation:
+
+```cpp
+// Use the no_init + no_color_map variant to match graph-v3's semantics:
+// caller pre-initialises distances; no color map allocation.
+boost::dijkstra_shortest_paths_no_color_map_no_init(
+    bg, source,
+    boost::predecessor_map(boost::make_iterator_property_map(pred.begin(), get(boost::vertex_index, bg)))
+        .distance_map(boost::make_iterator_property_map(dist.begin(), get(boost::vertex_index, bg)))
+        .weight_map(get(&EdgeProp::weight, bg)));
+```
+
+#### Fairness rules
+
+| Concern | Rule |
+|---------|------|
+| **Init cost** | Use `_no_init` for BGL and pre-initialise distances before the timed region for both libraries — so neither pays init cost inside the timer. |
+| **Compiler flags** | Both compiled with `-O3 -march=native`; confirm BGL headers are not accidentally included from a debug install. |
+| **Heap difference** | BGL uses a d-ary heap (d=4) with decrease-key internally. Before Phase 4 graph-v3 will likely lose on dense graphs; after Phase 4 expect rough parity. Document this expectation in the results. |
+| **Property-map overhead** | BGL's extra indirection layer may give graph-v3 a small constant advantage even at heap parity. Note it in the analysis. |
+
+| Item | Detail |
+|------|--------|
+| **Create** | `benchmark/algorithms/bgl_dijkstra_fixtures.hpp` |
+| **Modify** | `benchmark/algorithms/benchmark_dijkstra.cpp` — add BGL variants alongside existing benchmarks |
+| **Verify** | BGL and graph-v3 produce identical distance arrays on the same graph + source (add a correctness assert outside the timed loop). |
+| **Verify** | Results recorded in `agents/indexed_dary_heap_results.md` alongside 4.1 numbers. |
+| **Commit** | `bench(dijkstra): add BGL comparison benchmarks` |
+| **Status** | Completed. CSR 100K results: BGL CSR is 10–15% faster than graph-v3 Idx8 on dense (ER/BA); graph-v3 ties/beats BGL CSR on low-E/V (grid/path); graph-v3 default beats BGL `adjacency_list` on every topology by 23–48%. Decision: no further heap changes; remaining dense-CSR gap is in CSR layout, not the heap. Full numbers in `agents/indexed_dary_heap_results.md` § Phase 4.3. |
+
+---
+
+## Phase 5 — Reuse for Prim's MST ✅
+
+### 5.1 Audit Prim's Implementation ✅
+
+| Item | Detail |
+|------|--------|
+| **Read** | `include/graph/algorithm/mst.hpp` |
+| **Action** | Identify whether Prim has the same lazy-deletion pattern. |
+| **Finding** | `prim()` is a thin wrapper over `dijkstra_shortest_paths` with a custom `prim_combine` lambda that returns `w_uv` instead of `d_u + w_uv`. There is no separate priority queue and no lazy-deletion code in `mst.hpp`. The intent was that Prim would inherit any heap improvements made to Dijkstra "for free". |
+
+### 5.2 Apply Indexed Heap to Prim ✅ Implemented via Option 1
+
+**Status:** Completed. The implementation attempt surfaced a pre-existing
+latent Prim correctness bug (described under "Root cause" below). Phase 5
+landed Option 1 of the two fixes considered in 5.2; Option 2 is documented
+below as a future optimization.
+
+#### Symptom that exposed the bug
+
+A new test `prim - indexed d-ary heap parity` over an 8-vertex weighted
+undirected graph (correct MST weight = 18, verified by Kruskal):
+
+- Default-heap `prim()` returned total weight **13** (wrong; corrupts
+  `weight[]` after vertex finalization).
+- Indexed-heap `prim()` aborted with `vector::operator[](npos)` from
+  `indexed_dary_heap::sift_up_` because `decrease(v)` was invoked on a
+  vertex `v` that had already been popped (position == `npos`).
+
+#### Root cause
+
+`dijkstra_shortest_paths` relies on the Dijkstra invariant: with non-negative
+weights and `combine = plus`, distance is monotonic, so a finalized vertex
+can never be relaxed again. The relax step therefore omits a "skip if
+finalized" guard.
+
+Prim's combine `(d_u, w_uv) -> w_uv` breaks that invariant: the priority of
+a vertex is just the cheapest currently-known incident edge, which is **not**
+monotonic in the order vertices are popped. After `v` is finalized with
+`weight[v] = w_uv`, a later-popped neighbor `y` may present an edge `y → v`
+with `w_yv < weight[v]`. The relax succeeds, corrupts `weight[v]` (which is
+the *output* MST tree-edge weight), and then:
+
+- Default heap: re-pushes `v`; the stale-pop check
+  `compare(distance, w) = compare(w_yv, w_yv) = false`, so `v` is examined a
+  second time and its outgoing edges are re-relaxed. Garbage MST weight.
+- Indexed heap: calls `decrease(v)` on a finalized `v` whose position is
+  `npos` → out-of-bounds vector access → SIGABRT.
+
+The existing trivial Prim tests (triangles, 4-vertex paths, single-cluster
+sparse graphs) never trigger the post-finalization re-relax case, so the
+bug had been latent.
+
+#### Fix applied — Option 1 (guard inside `prim()` only)
+
+Wrap `weight_fn` so it returns `+infinity` for any vertex already in a
+`finalized` set maintained by a Prim-specific visitor's `on_finish_vertex`.
+The wrapped weight makes the relax test
+`compare(combine(d_u, w_uv), weight[v])` evaluate to
+`compare(infinity, weight[v]) = false`, suppressing both the corrupting
+update and the spurious `decrease`.
+
+Storage is dispatched on `adj_list::index_vertex_range<G>`:
+
+- Dense / contiguous-id graphs (e.g. `vector<vector<...>>`): `std::vector<bool>`
+  indexed by id. ~1 bit per vertex; one predictable branch and bit-load per
+  edge in the inner loop.
+- Sparse / mapped-id graphs: `std::unordered_set<vertex_id_t<G>>`. One hash
+  lookup per edge, used only when the dense path is not viable.
+
+The `Heap` template parameter is now exposed on `prim()`, so callers can
+opt into `use_indexed_dary_heap<D>{}` (Phase 4.2 recommendation for dense /
+scale-free workloads).
+
+#### Option 2 — standalone Prim (deferred)
+
+Reimplementing `prim()` as a first-class algorithm (no Dijkstra reuse) would
+remove the `combine`-lambda hack and the `distance[]` array that Dijkstra
+maintains shadow-style for Prim's use. Expected gains are roughly **5–10%**
+on dense graphs (one fewer `combine` call and one fewer indirection in the
+relax loop, no shadow distance writes), with no asymptotic change — both
+remain `O(E log V)`. Cost: a second algorithm body to maintain and to keep
+in lockstep with future Dijkstra heap work. Not pursued in Phase 5; revisit
+only if Prim becomes a measured bottleneck.
+
+#### Action
+
+| Item | Detail |
+|------|--------|
+| **Action** | Implemented Option 1 in `mst.hpp`; exposed `Heap` template parameter on `prim()` defaulting to `use_default_heap{}`. |
+| **Verify** | New regression test `prim - indexed d-ary heap parity` (test_mst.cpp): 8-vertex graph, MST = 18 cross-checked against Kruskal. Default heap, `use_indexed_dary_heap<4>{}`, and `use_indexed_dary_heap<8>{}` all return 18 with matching `weight[]` arrays. Full ctest: 4848/4848 pass. |
+| **Commit** | `fix(mst): correct Prim post-finalization re-relax + add heap selector (Phase 5)` |
+
+---
+
+## Open Questions
+
+1. ~~**PositionMap ownership.** Owned by the heap (simplest, allocates per call), or
+   passed in (zero-allocation for repeated calls, more API surface)? Default to
+   owned-by-heap for the first cut.~~ **Resolved:** PositionMap is owned by the heap.
+2. ~~**Arity as runtime vs compile-time.** Compile-time only — runtime would lose
+   the constexpr unrolling that justifies d-ary heaps in the first place.~~ **Resolved:**
+   Arity is a compile-time `std::size_t` template parameter. A runtime arity would
+   prevent the compiler from unrolling the inner child-comparison loop (the hot path
+   in `sift_down`), eliminating the main performance advantage of d-ary heaps over a
+   standard binary heap. Rationale is documented in `indexed_dary_heap.hpp`.
+3. ~~**`Compare` indirection cost.** The heap calls `compare_(distance_(a), distance_(b))`
+   twice per sift-down step (one comparator call per child + one against the parent).
+   For trivial `DistanceFn` (vector lookup) this should inline; verify in benchmarks.~~
+   **Resolved:** Disassembly of the release `benchmark_dijkstra` binary
+   (compressed_graph + `use_indexed_dary_heap<4>` + `std::less<double>` + `container_value_fn<vector<double>>`)
+   shows the `run` lambda — which contains the inlined `sift_up_`, `sift_down_`,
+   `push_or_decrease`, `pop`, and the Dijkstra relax loop — has **zero `call`
+   instructions to `compare_` or `distance_`** in the hot path.
+
+   Verified at both `-O3` and `-O2`:
+
+   | Metric (run lambda body, CSR + Idx4) | -O3 | -O2 |
+   |---|---:|---:|
+   | `call` instructions to `compare_` / `distance_` / `std::less` | 0 | 0 |
+   | `call`s present (all cold: vector grow / new / delete / memcpy) | 2 | 24 |
+   | `ucomisd` comparison sites | 8 | 8 |
+   | Out-of-line `sift_up_` / `sift_down_` / `std::less<double>::operator()` symbols in object | none | none |
+
+   At -O3 the only call-outs are to `std::vector::_M_realloc_append`; at -O2
+   the call-outs expand to direct `operator new` / `operator delete` /
+   `memcpy` (8 each, from the heap-vector growth path) but the comparison
+   sites themselves remain pure `ucomisd` against direct base+index*8 loads
+   from the distance buffer (e.g. `movsd (%r8,%r9,8),%xmm1`). GCC fully
+   inlines `heap_distfn` (the capturing lambda) → `container_value_fn::operator()`
+   → `vector::operator[]` to a raw `double*` indexed load and reduces
+   `std::less<double>::operator()` to a bare compare at both optimisation
+   levels. The functional-style `compare_(distance_(a), distance_(b))`
+   abstraction has zero runtime cost for trivial `DistanceFn` / `Compare`
+   types, which is the only configuration the benchmarks measure.
+4. ~~**Visitor `on_examine_vertex` semantics on multi-source seeding.** The current
+   multi-source code seeds N vertices into the queue. With the indexed heap, the
+   first pop of each source is the settled pop (no re-pushes possible since
+   distance is already 0). Confirm visitor semantics are unchanged.~~
+   **Resolved:** Confirmed by inspection and by a parity test
+   (`dijkstra(indexed_heap) - multi-source visitor parity vs default heap`,
+   `tests/algorithms/test_dijkstra_indexed_heap.cpp`). With non-negative
+   weights every source is pushed at distance 0 and is finalized on its
+   first pop (no later relax can lower distance below 0), so on every path
+   each vertex fires `on_discover_vertex`, `on_examine_vertex`, and
+   `on_finish_vertex` exactly once. The default heap achieves this via its
+   stale-pop skip at the top of the main loop (documented at
+   `dijkstra_shortest_paths.hpp` lines 343–355); the indexed heap achieves
+   it structurally because true decrease-key means the heap never contains
+   duplicates. Test verifies all four counters (`examine`, `finish`,
+   `discover`, `relaxed`, `not_relaxed`) agree byte-for-byte across
+   `use_default_heap{}`, `use_indexed_dary_heap<4>{}`, and
+   `use_indexed_dary_heap<8>{}` on the CLRS graph with sources `{0, 3}`.
+5. ~~**Should the new heap live in `graph/detail/` or be promoted to `graph/container/`?**
+   Defer the decision — start in `detail/` and promote only if external code finds it
+   useful.~~ **Resolved:** Keep `indexed_dary_heap` in `graph/detail/`. It is an
+   implementation detail of `dijkstra_shortest_paths` (and now `prim`), selected
+   internally via the `use_indexed_dary_heap<D>` heap-tag selector. Users do not
+   instantiate or name the heap type directly, so there is no benefit to exposing
+   it in the public `graph/container/` namespace. Revisit only if a future external
+   use case appears.
+6. ~~**Why is graph-v3 CSR slower than BGL CSR?** Phase 4.3 shows BGL's
+   `compressed_sparse_row_graph` is 10–15% faster than graph-v3 + Idx8 on dense
+   graphs (ER/BA). However the comparison is not arity-equivalent: BGL's
+   `d_ary_heap_indirect` is hard-coded to arity 4 (`d_ary_heap_indirect<Vertex, 4, ...>`
+   in `boost/graph/dijkstra_shortest_paths.hpp`), while the Phase 4.3 table used
+   graph-v3 Idx8. The benchmark already has an Idx4 variant; re-running the
+   comparison with `BM_Dijkstra_CSR_*_Idx4` vs `BM_Dijkstra_BGL_CSR_*` would give
+   an apples-to-apples heap comparison and may narrow or close the gap.
+   Remaining candidate causes if a gap persists: (a) BGL's
+   `get(&bgl_edge_prop::weight, g)` weight-map may compile to a raw pointer stride
+   with zero indirection, while graph-v3 edge values go through an `edge_value()`
+   accessor that may add a level of indirection or prevent auto-vectorisation;
+   (b) cache-line alignment differences in the CSR adjacency arrays.
+   Resolving this fully would require profiling (perf/vtune) and is out of scope
+   for the current plan.~~
+   **Resolved:** Re-ran on the same machine, n = 100 K, 3-run averages of CPU
+   time. Full table in `indexed_dary_heap_results.md` § "Phase 4.3a — Apples-to-apples
+   re-run with Idx4". Summary:
+
+   | Topology | Idx4 vs BGL CSR | Idx8 vs BGL CSR |
+   |---|---:|---:|
+   | ER Sparse | +7.7% | +5.9% |
+   | BA        | +9.4% | +4.6% |
+   | Grid      | +36.5% | +38.5% |
+   | Path      | +15.0% | +14.6% |
+
+   **Arity is not the bottleneck.** Switching to Idx4 (apples-to-apples with
+   BGL's hard-coded arity-4) does not narrow the gap on any topology — Idx4 and
+   Idx8 are within 1–3 percentage points of each other, and on BA Idx8 is
+   actually slightly faster (power-law graphs reward wider arity on high-degree
+   hubs). The gap is largest on Grid (~37%), which has the most predictable
+   heap-access pattern of any topology — if the heap were the bottleneck Grid
+   would have the *smallest* gap, not the largest. The remaining gap is in
+   the relax loop's edge-value access, not the heap. Most plausible cause is
+   BGL's `get(&edge_prop::weight, g)` resolving to a raw `Weight*` indexed by
+   edge offset, vs graph-v3's `edge_value(g, uv)` going through an iterator
+   `value()` accessor on `compressed_graph`'s edge-property storage. Confirming
+   and fixing this requires `perf stat` / `perf record` profiling on
+   `compressed_graph` and is out of scope for this plan. **No further heap
+   changes recommended.** For dense CSR workloads, prefer Idx8 over Idx4
+   (consistently 1–5 percentage points faster).
+
+---
+
+## Out of Scope
+
+- Fibonacci heap, pairing heap, or radix heap implementations.
+- Replacing other algorithms' priority queues (BFS variants, A*, etc.).
+- Changing public algorithm signatures beyond adding the optional `Heap` template
+  parameter.
+- Parallel / concurrent heap variants.
diff --git a/agents/dary_heap/indexed_dary_heap_results.md b/agents/dary_heap/indexed_dary_heap_results.md
new file mode 100644
index 0000000..5623738
--- /dev/null
+++ b/agents/dary_heap/indexed_dary_heap_results.md
@@ -0,0 +1,591 @@
+# Dijkstra Comparative Benchmarks — Phase 4.1
+
+Captured: 2026-04-25  
+Branch: `indexed-dary-heap`  
+Binary: `build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra`  
+Flags: `--benchmark_min_time=1s`  
+Runs: 3 (averages reported; CV flags entries with coefficient of variation > 5%)
+
+## Machine
+
+| Property | Value |
+|----------|-------|
+| Host | Titania |
+| CPUs | 20 × 3609.6 MHz |
+| L1-D | 48 KiB × 10 |
+| L2 | 1280 KiB × 10 |
+| L3 | 25600 KiB × 1 |
+| OS | Linux |
+
+## Heap variants compared
+
+| Tag | Description |
+|-----|-------------|
+| **Default** | `use_default_heap` — `std::priority_queue`, lazy deletion |
+| **Idx2** | `use_indexed_dary_heap<2>` — binary heap, true decrease-key |
+| **Idx4** | `use_indexed_dary_heap<4>` — 4-ary heap, true decrease-key |
+| **Idx8** | `use_indexed_dary_heap<8>` — 8-ary heap, true decrease-key |
+
+---
+
+## Results — CSR (`compressed_graph`) primary container
+
+All times are wall-clock nanoseconds per Dijkstra call (average of 3 runs).  
+`†` = CV > 5% (run-to-run graph variation; ER/BA are re-generated each run).  
+`↑` = improvement vs Default; `↓` = regression.
+
+### Erdős–Rényi, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 58,360 | 1,272,919 | 27,049,885 † | — |
+| Idx2 | 46,190 | 1,089,690 | 24,233,457 † | **↑ −10%** |
+| Idx4 | 57,661 | 1,178,680 | 25,756,981 † | **↑ −5%** |
+| Idx8 | 57,465 | 1,171,452 | 20,216,860 | **↑ −25%** |
+
+> Note: CV is high for ER graphs (19–34% at 100K) because the topology is re-randomised
+> between runs. The intra-run RMS reported by Google Benchmark is < 5% for all variants.
+> **Idx8 at 100K reaches 20.2 ms, meeting the −25% target vs the Phase 0 baseline (29.1 ms).**
+
+### 2D Grid, E/V ≈ 4
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 24,385 | 505,121 | 6,026,301 | — |
+| Idx2 | 18,664 | 509,521 | 6,671,405 | ↓ +11% |
+| Idx4 | 14,554 | 604,026 | 8,165,088 | ↓ +35% |
+| Idx8 | 15,033 | 609,548 | 8,400,126 | ↓ +39% |
+
+> Grid graphs have moderate re-relaxation but a low E/V ratio (≈4).
+> The indexed heap's position-map bookkeeping overhead outweighs the decrease-key benefit.
+> **Default heap wins on grid — indexed heap should not be the default for grid-like workloads.**
+
+### Barabási–Albert, m=4, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 53,632 | 1,261,122 | 22,904,717 | — |
+| Idx2 | 42,732 | 1,062,473 | 22,125,874 | **↑ −3%** |
+| Idx4 | 55,970 | 1,140,516 | 19,998,964 | **↑ −13%** |
+| Idx8 | 54,138 | 1,135,116 | 19,038,871 | **↑ −17%** |
+
+> BA graphs have hub vertices with high degree → many decrease-key calls → indexed heap wins.
+> Idx8 provides the best result (−17%).
+
+### Path graph, E/V = 1
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 2,885 | 27,188 | 268,708 | — |
+| Idx2 | 3,213 | 32,342 | 329,337 | ↓ +23% |
+| Idx4 | 3,299 | 32,729 | 326,018 | ↓ +21% |
+| Idx8 | 3,232 | 32,787 | 327,820 | ↓ +22% |
+
+> Path graph = zero decrease-key calls. The indexed heap has overhead (position-map writes
+> on every push) but never benefits. **Default heap wins on minimal-relaxation workloads.**
+
+---
+
+## Results — VoV (`dynamic_graph`) secondary container
+
+### Erdős–Rényi, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 49,568 | 1,270,500 | 28,499,453 | — |
+| Idx4 | 58,856 | 1,239,416 | 27,529,959 | **↑ −3%** |
+
+### 2D Grid, E/V ≈ 4
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 24,193 | 492,725 | 6,553,458 | — |
+| Idx4 | 13,909 | 643,017 | 9,346,129 | ↓ +43% |
+
+### Barabási–Albert, m=4, E/V ≈ 8
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 47,737 | 1,315,464 | 26,706,329 † | — |
+| Idx4 | 57,718 | 1,237,012 | 25,880,471 | **↑ −3%** |
+
+### Path graph, E/V = 1
+
+| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) |
+|------|------:|-------:|--------:|:-----------------:|
+| Default | 4,539 | 43,331 | 433,332 | — |
+| Idx4 | 4,925 | 46,501 | 466,101 | ↓ +8% |
+
+---
+
+## Cross-topology summary (CSR, 100K vertices)
+
+| Topology | E/V | Best heap | Win vs Default | Note |
+|----------|----:|-----------|:--------------:|------|
+| ER Sparse | ≈8 | **Idx8** | −25% | High re-relaxation; meets target |
+| BA | ≈8 | **Idx8** | −17% | Hub vertices drive decrease-key |
+| Grid | ≈4 | **Default** | — (indexed +39%) | Low heap pressure; position-map overhead dominates |
+| Path | 1 | **Default** | — (indexed +22%) | No decrease-key benefit; pure overhead |
+
+## Key Observations
+
+| Observation | Detail |
+|-------------|--------|
+| **Idx8 wins on high E/V graphs** | ER Sparse −25%, BA −17% at 100K CSR. Higher arity → less sift-down cost per pop, better cache locality. |
+| **Default wins on low E/V graphs** | Grid (+39%), Path (+22%). Position-map bookkeeping dominates when decrease-key calls are rare. |
+| **Arity 8 > 4 > 2 on ER/BA** | Consistent ordering; higher arity worth the wider sift-down fan-out. |
+| **VoV gap is smaller** | VoV Idx4 is only marginally better on ER/BA (−3%) vs CSR Idx8 (−25%). Extra indirection through VoV reduces the heap's relative contribution. |
+| **Phase 0 target met on CSR ER Sparse** | Idx8 at 20.2 ms vs baseline 29.1 ms = **−31%** (target was −25%). |
+
+## Recommendation for Phase 4.2
+
+Results are **topology-dependent** — a single default cannot be optimal across all workloads:
+
+- **Do not change the default to indexed heap unconditionally.** The grid regression (+39%) is too severe.
+- **Document the selector pattern** for users who know their topology has high re-relaxation (dense random / BA-like graphs).
+- **Consider a heuristic**: if `E/V > threshold` (e.g., 6), auto-select Idx8; otherwise keep Default. Requires computing E/V at call time — adds overhead but could be compile-time detectable for CSR.
+- **Best documented choice**: `use_indexed_dary_heap<8>` for dense random/BA graphs on CSR; `use_default_heap` otherwise.
+
+---
+
+## Phase 4.3 — BGL Comparison
+
+Boost.Graph (header-only, version at `/home/phil/dev_graph/boost`) wired into
+the same benchmark harness. Both libraries operate on topologically identical
+graphs built from the same `edge_list` (see `bgl_dijkstra_fixtures.hpp`). BGL
+uses `dijkstra_shortest_paths_no_color_map_no_init` for fairness — caller
+pre-initialises distances; no color-map allocation inside the timed region.
+
+A startup parity check (`check_bgl_distance_parity`) asserts that BGL and
+graph-v3 produce identical distance vectors for ER, BA, and Path graphs at
+n=1024 from source 0. Benchmarks abort if parity fails.
+
+Build with: `cmake -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/path/to/boost`.
+
+### Results — n = 100 000 (3-run average)
+
+| Topology | graph-v3 default (CSR) | graph-v3 Idx8 (CSR) | BGL CSR | BGL adjacency_list |
+|----------|------------------------:|---------------------:|---------:|--------------------:|
+| ER Sparse (E/V≈8) | 26.2 ms | **22.9 ms** | **19.9 ms** | 34.2 ms |
+| BA (E/V≈8)        | 26.9 ms | **21.7 ms** | **19.6 ms** | 30.9 ms |
+| Grid (E/V≈4)      | **6.2 ms** | 8.9 ms | **6.1 ms** | 9.9 ms |
+| Path (E/V=1)      | **0.270 ms** | 0.329 ms | **0.283 ms** | 0.522 ms |
+
+### Observations
+
+| Observation | Detail |
+|-------------|--------|
+| **graph-v3 beats BGL `adjacency_list` on every topology** | Default heap beats BGL adj by 23–48% (ER 26.2 vs 34.2 ms, BA 26.9 vs 30.9, Grid 6.2 vs 9.9, Path 0.27 vs 0.52). Reflects graph-v3's iterator-based edge layout vs BGL's property-map indirection. |
+| **BGL CSR is the fastest CSR on dense graphs** | BGL CSR wins ER (-13% vs Idx8) and BA (-10% vs Idx8). BGL CSR uses a 4-ary indexed heap natively *and* a tighter CSR layout — that combination still has an edge over our Idx8. |
+| **graph-v3 ties BGL CSR on grid and path** | Grid: 6.16 vs 6.05 ms (+2%). Path: 0.270 vs 0.283 ms (-5% — graph-v3 default actually faster). At low E/V the heap implementation no longer matters; layout cost dominates and the two CSR layouts are equivalent. |
+| **Idx8 closes most of the gap to BGL CSR** | On ER and BA, Idx8 is within 13–15% of BGL CSR (vs 32–37% for default). Switching to Idx8 captures the bulk of the dense-graph win available from a true decrease-key heap. |
+| **Adjacency-list comparison validates default-heap choice** | Against the closer-equivalent `adjacency_list` container, graph-v3 with the default heap is 23–48% faster across all four topologies — confirming that the Phase 4.2 decision (keep `use_default_heap`) does not reflect a missing-feature gap vs BGL. |
+
+### Conclusion
+
+- For random / scale-free workloads on CSR, BGL's CSR + native indexed heap is
+  ~10–15% faster than graph-v3's `compressed_graph` + `use_indexed_dary_heap<8>`.
+  The remaining gap is plausibly attributable to BGL's CSR using `boost::vec_adj_list_traits`
+  edge-property layout (hot-path arrays packed differently from our edge descriptors).
+- For low-E/V workloads (grid, path) graph-v3 ties or beats BGL CSR with the
+  default heap — there is no missing optimisation here.
+- Against `boost::adjacency_list` (the closer match for `dynamic_graph<vov>`),
+  graph-v3 wins on every topology measured.
+- No further heap changes recommended on the strength of these results: the
+  Phase 4.2 decision (default = `use_default_heap`, opt-in `use_indexed_dary_heap<8>`)
+  remains the right configuration. Future work, if the dense-CSR gap matters
+  to a user, is in CSR layout, not in the heap.
+
+### Phase 4.3a — Apples-to-apples re-run with Idx4 (resolves Open Q6)
+
+BGL's `dijkstra_shortest_paths` hard-codes `d_ary_heap_indirect<Vertex, 4, ...>`
+(see `boost/graph/dijkstra_shortest_paths.hpp`). The Phase 4.3 table compared
+against graph-v3 Idx8, leaving open whether the dense-CSR gap was an arity
+artefact. Re-run on the same machine, n = 100 000, 3-run averages of CPU time
+(`--benchmark_min_time=2s`):
+
+| Topology | graph-v3 Idx4 | graph-v3 Idx8 | BGL CSR | Idx4 vs BGL | Idx8 vs BGL |
+|----------|--------------:|--------------:|--------:|-----------:|-----------:|
+| ER Sparse (E/V≈8) | 22.77 ms | 22.38 ms | 21.14 ms | +7.7% | +5.9% |
+| BA (E/V≈8)        | 21.99 ms | 21.02 ms | 20.10 ms | +9.4% | +4.6% |
+| Grid (E/V≈4)      |  8.64 ms |  8.77 ms |  6.33 ms | +36.5% | +38.5% |
+| Path (E/V=1)      |  0.330 ms| 0.329 ms | 0.287 ms | +15.0% | +14.6% |
+
+**Findings**
+
+- **Arity is not the bottleneck.** Switching to Idx4 (apples-to-apples with
+  BGL) does not narrow the gap: ER and BA stay within 1–3 percentage points
+  of the Idx8 number; Grid is unchanged at ~37%; Path is unchanged at ~15%.
+  On BA, Idx8 is actually a touch faster than Idx4 (+4.6% vs +9.4%) — BA's
+  power-law degree distribution rewards wider arity (fewer levels per
+  decrease-key on high-degree hubs).
+- **The gap is uniform across topologies that exercise the heap very
+  differently.** Grid (uniform degree 4, predictable heap pattern) shows the
+  biggest absolute gap (36–38%). If the heap were the bottleneck the gap
+  would track topology, not be uniform across them.
+- **Suspect: weight-map indirection in the relax loop.** BGL's
+  `get(&edge_prop::weight, g)` typically resolves to a raw `Weight*` indexed
+  by edge offset (zero indirection). graph-v3's `edge_value(g, uv)`
+  goes through the iterator's `value()` accessor on the `compressed_graph`'s
+  edge-property storage, which may add a level of pointer-chasing or block
+  auto-vectorisation. Verifying this requires `perf stat -e
+  L1-dcache-load-misses,branch-misses` or `perf record` profiling and is
+  out of scope for this plan.
+- **Recommendation: no further heap work.** The Phase 4.2 decision stands
+  (default = `use_default_heap`, opt-in `use_indexed_dary_heap<D>`). For
+  dense workloads on CSR, prefer Idx8 over Idx4 (consistently 1–5
+  percentage points faster on ER and BA at n = 100 K). If the dense-CSR
+  gap to BGL becomes important to a user, the next investigation belongs
+  in `compressed_graph`'s edge-value access path, not in the heap.
+
+---
+
+## Phase 4.3b — CSR access-path profiling (Phase 1 of `csr_edge_value_perf_plan.md`)
+
+Captured: 2026-04-26  
+Goal: Quantify how the Idx4-vs-BGL CSR gap scales with `n`, to discriminate between
+work-bound (constant per-edge overhead) and memory-bound (gap widens with `n` as the
+working set spills out of cache) hypotheses.
+
+### Setup
+
+| Knob | Value |
+|------|-------|
+| Binary | `build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra` |
+| Flags | `--benchmark_min_time=2s` |
+| Pinning | `taskset -c 4` (single physical core) |
+| Runs | 5 (median CPU time reported) |
+| Sizes | `n ∈ {10 000, 100 000}` (only sizes registered by the benchmark) |
+| Topologies | BA, ER_Sparse, Grid (4-regular), Path |
+
+Working-set fit (vertices × 4B + edges × ~16B):
+- `n = 10 000`, BA/ER (~10 edges/v): ~0.6 MB → fits in L2 (1.28 MB) ✅
+- `n = 100 000`, BA/ER: ~6 MB → spills to L3 (25 MB) ✅
+- `n = 100 000`, Grid (4 edges/v): ~2.4 MB → fits in L3 ✅
+- All sizes fit in L3.
+
+### 1.1 Multi-size baseline (median of 5 runs, CPU ns)
+
+| Topology | n | Idx4 | Idx8 | BGL_CSR | Idx4/BGL | Idx8/BGL |
+|----------|---:|------:|------:|---------:|---------:|---------:|
+| BA        | 10 000 |  1 185 000 |  1 179 420 |    990 450 | **1.196×** | 1.191× |
+| ER_Sparse | 10 000 |  1 192 860 |  1 169 130 |    987 541 | **1.208×** | 1.184× |
+| Grid      | 10 000 |    624 630 |    628 727 |    454 421 | **1.375×** | 1.384× |
+| Path      | 10 000 |     33 531 |     33 760 |     28 204 | **1.189×** | 1.197× |
+| BA        | 100 000 | 23 651 400 | 22 222 500 | 21 449 200 | **1.103×** | 1.036× |
+| ER_Sparse | 100 000 | 23 318 800 | 23 397 300 | 22 184 000 | **1.051×** | 1.055× |
+| Grid      | 100 000 |  8 701 240 |  8 909 080 |  6 359 360 | **1.368×** | 1.401× |
+| Path      | 100 000 |    336 510 |    332 385 |    289 129 | **1.164×** | 1.150× |
+
+### Scaling table — does the Idx4-vs-BGL gap grow with `n`?
+
+| Topology  | gap @ 10 K | gap @ 100 K | Δ (pp) |
+|-----------|-----------:|------------:|-------:|
+| BA        |     1.196× |      1.103× |  −9.4  |
+| ER_Sparse |     1.208× |      1.051× | −15.7  |
+| Grid      |     1.375× |      1.368× |  −0.6  |
+| Path      |     1.189× |      1.164× |  −2.5  |
+
+### Interpretation
+
+- The Idx4-vs-BGL gap **does not grow** with `n` for any topology, even as the working
+  set crosses from L2 (10 K) to L3 (100 K). For BA and ER it actually **shrinks** by
+  9–16 percentage points, consistent with a fixed setup/initialisation cost
+  (allocator warmup, position-map alloc, vector grow) being amortised over the larger
+  workload.
+- This is **inconsistent with a memory-bound hypothesis** (suspect 2: cold edge_value
+  cache lines, suspect 5: prefetch). If the gap were caused by extra cache traffic
+  per edge it would widen with `n`, not stay flat or shrink.
+- The Grid topology shows the **most stable** gap (~1.37× at both sizes). Grid is
+  4-regular with a deterministic stencil pattern, so it has the cleanest per-edge
+  signal and the smallest overhead-amortisation effect. **This is the topology to
+  focus subsequent profiling on.**
+- **Working hypothesis after Phase 1.1**: the gap is **work-bound** — extra
+  instructions executed per edge in the relax inner loop (suspect 1: pointer-subtract
+  + extra load to reach `edge_value_`, suspect 3: `target_id` two-hop, or suspect 4:
+  `compressed_graph::find_vertex`). Phase 1.2/1.3 (perf counters + perf annotate) are
+  needed to confirm by checking `instructions/edge` and miss rates.
+
+### 1.2 Hardware counters — **DEFERRED**
+
+`/usr/lib/linux-tools-6.8.0-110/perf` is installed but Linux PMC events are
+`<not supported>` under this WSL2 kernel. Enabling requires
+`nestedVirtualization=true` in `%UserProfile%\.wslconfig` followed by
+`wsl --shutdown`. Once available, the counters to capture are:
+
+```
+perf stat -e cycles,instructions,L1-dcache-load-misses,LLC-load-misses,\
+  branch-misses,branch-instructions \
+  taskset -c 4 ./benchmark_dijkstra \
+  --benchmark_filter='^BM_Dijkstra_(CSR_Grid_Idx4|BGL_CSR_Grid)/100000$' \
+  --benchmark_min_time=2s
+```
+
+Compute and tabulate: IPC, cycles/edge, loads/edge, L1-D miss rate, LLC miss
+rate, branch-miss rate. Repeat for `ER_Sparse` to confirm.
+
+**Predicted outcome (from 1.1):** Idx4 will show *more instructions/edge* with
+*similar miss rates* — the work-bound signature.
+
+### 1.3 perf record + annotate — **DEFERRED** (same WSL2 PMC limitation)
+
+### Verdict (preliminary, pending 1.2/1.3 confirmation)
+
+**Memory-bound hypothesis ruled out by the scaling test.** The flat-or-shrinking
+gap across an L2→L3 transition leaves only the work-bound suspects from the plan:
+1, 3, and 4. Phase 2 (disassembly diff) of `csr_edge_value_perf_plan.md` can
+proceed in parallel with 1.2/1.3 and may itself be sufficient to identify the
+extra-instruction site.
+
+---
+
+## Phase 4.3b (Windows) — VTune software-mode hotspots, Grid_Idx4/100K (MSVC)
+
+Captured: 2026-04-26
+Binary: `build/windows-msvc-relwithdebinfo/benchmark/algorithms/benchmark_dijkstra.exe`
+(MSVC 19.50.35729, `/O2 /Ob2 /Zi` from `windows-msvc-relwithdebinfo` preset)
+Tool: Intel VTune Profiler 2025.10.0 (build 631836), `-collect hotspots -knob sampling-mode=sw`
+Result dir: `build/vtune/hotspots_grid_idx4_100k_msvc_001` (raw .vtune dir, gitignored)
+Command: `benchmark_dijkstra.exe --benchmark_filter=^BM_Dijkstra_CSR_Grid_Idx4/100000$ --benchmark_min_time=15s --benchmark_repetitions=1`
+CPU: Intel Alder Lake-S, 12C / 20T, base 3.61 GHz; 30 s sample window.
+
+### Why software-mode (no µarch breakdown yet)
+
+VTune's hardware event-based sampling (`-collect uarch-exploration`) needs
+either the SEP sampling driver installed or the collector running as
+Administrator. The current Windows session has neither, so this run uses
+user-mode sampling. Result: per-function and per-source-line CPU-time
+attribution, but **no Front-End / Bad-Speculation / Back-End-Memory /
+Retiring breakdown**. The µarch run is deferred — see "Next" at the end.
+
+### Top hotspots (function level)
+
+| Rank | CPU time | % of total | Symbol |
+|------|---------:|-----------:|--------|
+| 1 | 9088 ms | 31.2 % | `indexed_dary_heap<...,vector_position_map,4,...>::sift_down_` |
+| 2 | 3692 ms | 12.7 % | `std::less<double>::operator()` (1st copy) |
+| 3 | 2768 ms |  9.5 % | `container_value_fn<vector<double>>::operator()` |
+| 4 | 1511 ms |  5.2 % | `vector<double>::operator[]` (distance buffer) |
+| 5 | 1294 ms |  4.4 % | dijkstra `relax_target` lambda body |
+| 6 | 1277 ms |  4.4 % | `incidence_view::iterator::operator*` |
+| 7 |  926 ms |  3.2 % | `std::less<double>::operator()` (2nd copy) |
+| 8 |  783 ms |  2.7 % | `vector_iterator<csr_col<uint>>::operator++` |
+| 9 |  751 ms |  2.6 % | dijkstra `run` lambda body |
+| 10 |  704 ms |  2.4 % | `indexed_dary_heap<...>::sift_up_` |
+| 11 |  478 ms |  1.6 % | `std::less<double>::operator()` (3rd copy) |
+| 12 |  461 ms |  1.6 % | `vector<unsigned int>::operator[]` |
+
+Total of the twelve = 23.7 s out of 30 s observed CPU time = **79 %** of the work.
+
+### Top source lines (where the cycles actually go)
+
+| CPU time | File:Line | What it is |
+|---------:|-----------|------------|
+| 5417 ms | `type_traits:2388` | (libstdc++ MSVC STL `std::less::operator()` impl) |
+| 5309 ms | `indexed_dary_heap.hpp:228` | `sift_down_` inner child-comparison loop |
+| 3549 ms | `traversal_common.hpp:188` | `container_value_fn::operator()` returning `c[uid]` |
+| 2332 ms | `indexed_dary_heap.hpp:234` | `sift_down_` "smallest child vs k" check |
+| 1526 ms | `vector:1934` | `vector<double>::operator[]` |
+| 1277 ms | `incidence.hpp:180` | edge descriptor materialisation |
+|  925 ms | `indexed_dary_heap.hpp:238` | `sift_down_` `i = best` advance |
+|  818 ms | `dijkstra_shortest_paths.hpp:252` | `w_uv = weight(g, uv)` in relax lambda |
+|  783 ms | `vector:287`  | iterator `operator++` |
+|  655 ms | `indexed_dary_heap.hpp:185` | `place_` writing `heap_[i] = k` |
+|  386 ms | `dijkstra_shortest_paths.hpp:465` | `relax_target(uv, uid)` call site |
+
+### Headline finding — **MSVC is not inlining what GCC inlined**
+
+The Linux/GCC analysis (Open Question 3 in this document) verified by
+disassembly that under both `-O3` and `-O2`:
+
+- `std::less<double>::operator()` collapses to a single `ucomisd`.
+- `container_value_fn::operator()` collapses to a `double*` indexed load.
+- `sift_up_` / `sift_down_` are fully inlined into the run lambda.
+
+Under MSVC `/O2` on the same source, **none of those collapses happen**:
+
+- `std::less<double>::operator()` appears as **three distinct callable
+  symbols** consuming **5096 ms = 17.5 %** of total CPU time.
+- `container_value_fn::operator()` is a real call (2768 ms = 9.5 %).
+- `sift_down_` is a real, non-inlined function consuming 9088 ms = **31.2 %**
+  on its own. Combined with `sift_up_` and the heap update path, indexed-heap
+  bookkeeping eats over a third of the workload.
+
+This **revises the Phase 4.3a diagnosis on MSVC** (the original was
+GCC-specific):
+
+- On GCC the heap is fully inlined and the residual gap to BGL is in the
+  edge-value access path (`edge_value(g, uv)` → `edge_value_[k]`).
+- On MSVC the heap `sift_down_` alone outweighs everything else — and three
+  copies of `std::less` not being merged is a known MSVC ABI behaviour
+  (each lambda capturing the comparator gets its own instantiation).
+
+### Implications for the original perf plan
+
+| Item from `csr_edge_value_perf_plan.md` | Status under MSVC |
+|---|---|
+| Phase 1.4 verdict — work-bound vs memory-bound | Software sampling can't classify; needs HW counters. |
+| Open Q3 — `compare_(distance_(...))` collapses | **Holds for GCC, fails for MSVC.** Multiple `std::less` symbols visible. |
+| Phase 2 — disassembly comparison | Even more important now: MSVC asm of `sift_down_` is the first thing to look at. |
+| Phase 4 candidate fix #1 — offset-aware `edge_value` | Less promising on MSVC (the heap dominates, not the edge access). |
+| Phase 4 candidate fix #2 — `incidence` fast path | Still relevant; `incidence_view::iterator::operator*` is 1277 ms. |
+| New MSVC-specific candidate | **Force-inline / hoist the comparator.** `__forceinline` or a wrapper that takes the captured `std::less<double>` by value and inlines its operator. |
+| New MSVC-specific candidate | **Inline `sift_down_`.** Annotate with `__forceinline` on MSVC; with arity 4 the body is small enough that this is profitable. Verify by re-profiling. |
+
+### Next
+
+| Step | What | Why |
+|---|---|---|
+| 1 | Re-run as Administrator (or install SEP driver) with `-collect uarch-exploration` | Get Front-End / Back-End-Memory / Retiring breakdown — confirms whether the call overhead is back-end-core (real work) or back-end-memory (data stalls). |
+| 2 | Disassemble `sift_down_` at MSVC `/O2` (VS Disassembly window from a debug-attached run) | Confirm the function is genuinely a separate call frame, not a thunk that just shows up in symbol-time accounting. |
+| 3 | Spike `__forceinline` on `sift_down_`, `sift_up_`, `less_than_`, `place_` | Cheap experiment; rerun the same hotspots collection and compare. If the heap symbols disappear from the top-12 and total time drops, this is the win. |
+| 4 | Only after the heap is inlined, retry the GCC-style edge-value-access investigation in `csr_edge_value_perf_plan.md` | The original diagnosis assumed an inlined heap; that assumption is invalid on MSVC until step 3. |
+
+---
+
+## Phase 4.3c — `GRAPH_DETAIL_FORCE_INLINE` spike results (MSVC, Grid_Idx4/100K)
+
+**Date:** 2026-04-27  
+**Branch:** `indexed-dary-heap`  
+**Build:** `windows-msvc-relwithdebinfo` (`/O2 /Ob2 /Zi`)  
+**VTune result:** `vtune/hotspots_grid_idx4_100k_msvc_004`  
+**Filter:** `BM_Dijkstra_CSR_Grid_Idx4/100000`, `--benchmark_min_time=15s`  
+**Collector:** software-mode sampling (~29 s wall-clock, 28.58 s CPU collected)
+
+### Changes applied
+
+```
+// GRAPH_DETAIL_FORCE_INLINE macro (MSVC → __forceinline, GCC/Clang → [[gnu::always_inline]] inline)
+// Applied to:
+//   place_()      — single write-point for heap_[i] + position map update
+//   less_than_()  — comparator choke point; sift_up_ / sift_down_ now call this
+//                   instead of compare_(distance_(a), distance_(b)) directly
+// NOT applied to sift_up_ / sift_down_ (bodies too large; would bloat call sites)
+// NOT applied to parent_of_ / first_child_of_ (static constexpr — always inlined)
+```
+
+### Top-15 hotspot comparison (CPU time, seconds)
+
+| Rank | Baseline (004 pre-spike / result 001) | CPU (s) | % | Post-spike (result 004) | CPU (s) | % | Δ % |
+|------|---------------------------------------|---------|---|-------------------------|---------|---|-----|
+| 1  | `heap::sift_down_`                      | 9.09 | 31.2 | `heap::sift_down_`                       | 8.99 | 31.5 | ≈0   |
+| 2  | `std::less<double>::operator()` (×1)    | 3.69 | 12.7 | `std::less<double>::operator()` (×1)    | 3.93 | 13.8 | +1.1 |
+| 3  | `container_value_fn::operator()`        | 2.77 |  9.5 | `container_value_fn::operator()`        | 2.86 | 10.0 | +0.5 |
+| 4  | `vector<double>::operator[]`            | —    |   — | `vector<double>::operator[]`            | 1.98 |  6.9 | new  |
+| 5  | dijkstra `relax` lambda                 | 1.28 |  4.4 | dijkstra lambda                         | 1.23 |  4.3 | ≈0   |
+| 6  | `incidence_view` iterator               | 1.28 |  4.4 | `std::less` (2nd copy)                  | 0.86 |  3.0 | —    |
+| 7  | `std::less` (2nd copy)                  | 0.93 |  3.2 | `incidence_view` iterator               | 0.83 |  2.9 | ≈0   |
+| 8  | `heap::sift_up_`                        | 0.80 |  2.7 | `heap::sift_up_`                        | 0.80 |  2.8 | ≈0   |
+| 9–15 | (various vector/iterator helpers)     | —    |  —  | (similar mix)                           | —    |  —  | ≈0   |
+
+### Interpretation
+
+**The spike had no measurable effect.** The profile is essentially identical:
+
+- `sift_down_` remains the top symbol at ~31 % whether or not `less_than_` / `place_` are force-inlined.
+- `std::less<double>::operator()` still appears as multiple separate call frames (~17 % combined).
+- `container_value_fn::operator()` is still a real non-inlined call (~10 %).
+
+This means **MSVC is not honouring `__forceinline` on `less_than_` and `place_` when called from inside `sift_down_`**, which is itself a separate, non-inlined function. The root cause is that `sift_down_` is not inlined into its call sites — so its callee force-inline annotations are local to its own body and do not collapse the full chain that GCC collapses.
+
+### Revised diagnosis
+
+The key missing piece is inlining `sift_down_` (and `sift_up_`) into the Dijkstra run-lambda. Until that happens, `__forceinline` on the inner helpers only affects calls *within* the sift body, which may already be inlined there; it does not help the outer symbol boundary.
+
+---
+
+## Phase 4.3d — `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` + `sift_up_` (MSVC)
+
+**Date:** 2026-04-27  
+**VTune result:** `vtune/hotspots_grid_idx4_100k_msvc_005`  
+**Change:** Added `GRAPH_DETAIL_FORCE_INLINE` to `sift_down_` and `sift_up_` declarations.  
+**Total CPU collected:** 28.38 s (vs 28.58 s in 004 — effectively identical)
+
+### Top-15 hotspots (result 005)
+
+| Rank | Function | CPU (s) | % |
+|------|----------|---------|---|
+| 1  | `heap::sift_down_`                          | 9.44 | 33.2 |
+| 2  | `std::less<double>::operator()` (1st)       | 4.09 | 14.4 |
+| 3  | `container_value_fn::operator()`            | 2.50 |  8.8 |
+| 4  | `vector<double>::operator[]`                | 1.61 |  5.7 |
+| 5  | dijkstra relax lambda                       | 1.25 |  4.4 |
+| 6  | `std::less<double>::operator()` (2nd)       | 1.06 |  3.7 |
+| 7  | `incidence_view` iterator `operator*`       | 0.92 |  3.2 |
+| 8  | `vector<unsigned>::operator[]`              | 0.67 |  2.4 |
+| 9  | `heap::sift_up_`                            | 0.57 |  2.0 |
+| 10 | `vector<unsigned>::push_back`               | 0.42 |  1.5 |
+| 11 | `heap::place_`                              | 0.33 |  1.1 |
+| 12 | `container_value_fn::operator()` (2nd)      | 0.31 |  1.1 |
+| 13 | `_Vector_iterator::operator++`              | 0.28 |  1.0 |
+| 14 | `vector<unsigned>::size`                    | 0.28 |  1.0 |
+| 15 | `heap::pop`                                 | 0.27 |  1.0 |
+
+### Interpretation
+
+**`__forceinline` on `sift_down_` is also ineffective.** MSVC silently ignores the annotation — `sift_down_` still appears as a distinct 9.4 s (33.2%) real call frame. The profile is statistically indistinguishable from results 004 (pre-sift annotation). MSVC's inliner is making a size-based refusal that `__forceinline` does not override for a function of this complexity when the call site is itself a complex template instantiation.
+
+**Key conclusion:** `__forceinline` / `[[gnu::always_inline]]` annotations alone are not sufficient to close the MSVC vs GCC inlining gap for `sift_down_`. A different approach is needed.
+
+### Candidate next approaches
+
+| Priority | Approach | Rationale |
+|----------|----------|-----------|
+| **High** | Increase `/Ob` (inline depth) — try `/Ob3` (available MSVC 19.26+) in the CMake release preset | Raises MSVC's inline budget per call site; may allow `sift_down_` to be inlined where `/Ob2` refuses |
+| High | Measure actual wall-clock ns before/after any change (not just symbol attribution) | Profile attribution is secondary; the benchmark median is the ground truth |
+| Medium | Manually hoist the `sift_down_` body into the Dijkstra run-lambda (proof-of-concept) | Establishes whether MSVC *can* produce the inlined shape at all and what the ceiling win is |
+| Medium | Profile with `/O2 /Ob3` release build and compare hotspot table | If `sift_down_` disappears from profile → the `/Ob` budget is the blocker |
+| Low | Elevate VTune `uarch-exploration` (admin / SEP driver) | Front-End/Back-End breakdown is only useful once the symbol boundary is resolved |
+
+---
+
+## Phase 4.3e — `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` (MSVC)
+
+**Date:** 2026-04-27  
+**Branch:** `indexed-dary-heap`  
+**Build:** `windows-msvc-release` (`/O2 /Ob3 /DNDEBUG`, no PDB)  
+**VTune result:** `vtune/hotspots_grid_idx4_100k_msvc_ob3_001`  
+**Filter:** `BM_Dijkstra_CSR_Grid_Idx4/100000`, `--benchmark_min_time=15s`  
+**Collector:** software-mode sampling (~29 s wall-clock, 28.97 s CPU collected)
+
+### Changes applied (on top of Phase 4.3d state)
+
+```
+// CMakePresets.json — windows-msvc-release:
+"CMAKE_CXX_FLAGS_RELEASE": "/O2 /Ob3 /DNDEBUG"   // was /O2 /Ob2 /DNDEBUG
+
+// indexed_dary_heap.hpp:
+GRAPH_DETAIL_FORCE_INLINE void sift_down_(size_type i)   // re-annotated
+```
+
+### VTune hotspot result
+
+| Rank | Function | CPU (s) | % |
+|------|----------|---------|---|
+| 1 | `func@0x1400041d0` (inlined run-lambda) | 28.62 | **98.8** |
+| 2–9 | misc CRT / allocator / timer helpers | 0.31 | 1.2 |
+
+**`sift_down_`, `sift_up_`, `std::less`, `container_value_fn`, `place_` — all gone from the profile.** 98.8% of CPU time is a single anonymous call frame, which is the Dijkstra run-lambda with the heap fully inlined into it. This is the same profile shape GCC produces at `-O2`.
+
+**Conclusion:** `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` is the combination that closes the MSVC inlining gap. Neither alone was sufficient (Phase 4.3d showed `__forceinline` alone had no effect at `/Ob2`).
+
+### Wall-clock medians (5 reps, `windows-msvc-release`)
+
+| Benchmark | `/Ob2` baseline (ns) | `/Ob3` + FI (ns) | Δ |
+|-----------|---------------------:|-----------------:|---|
+| Grid_Idx4/1K   |     — |    26,444 | — |
+| Grid_Idx4/10K  |     — |   562,731 | — |
+| Grid_Idx4/100K | 6,873,101 | 7,485,252 | **+8.9%** (regression) |
+| Path_Idx4/1K   |     — |     4,186 | — |
+| Path_Idx4/10K  |     — |    42,927 | — |
+| Path_Idx4/100K | 498,438 | 424,007 | **−14.9%** (win) |
+
+> Grid_Idx4/100K baseline was from `/Ob2` `relwithdebinfo` (with PDB/debug info overhead); the `/Ob3` release build without PDB is the fair comparison. The +8.9% regression on Grid may be noise or code-layout change. Path shows a clear −14.9% win — consistent with the profile showing the comparator chain now collapses.
+
+### Next steps
+
+| Priority | Step |
+|----------|------|
+| **High** | Run the full Grid/Path/ER/BA suite at `/Ob3` release and compare against the `/Ob2` baseline table in `indexed_dary_heap_baseline_msvc.md` — confirm win is consistent or isolate regressions |
+| High | Commit `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE sift_down_` as the permanent MSVC configuration if the full suite shows no regression |
+| Medium | Proceed to Thread B: CSR edge-value access path gap vs BGL (`csr_edge_value_perf_plan.md`) — now the heap is inlined on both GCC and MSVC, the original GCC-measured gap is the next target |
diff --git a/agents/dary_heap/perf_capture_manifest.txt b/agents/dary_heap/perf_capture_manifest.txt
new file mode 100644
index 0000000..0e6e872
--- /dev/null
+++ b/agents/dary_heap/perf_capture_manifest.txt
@@ -0,0 +1,48 @@
+# Capture manifest for Phase 2/3 of csr_edge_value_perf_plan.md.
+#
+# Each non-comment, non-blank line: <basename> <length_hex> <regex> [substrings...]
+#   - <regex>  is matched against the demangled symbol name (use this for
+#              patterns containing < or > to avoid cmd redirection issues).
+#   - subsequent substrings are AND-filtered after the regex.
+#
+# These artefacts give the Linux/GCC investigation a per-symbol reference
+# baseline to compare against. WSL has no hardware counters, so disassembly
+# diffs replace the perf-stat workflow.
+#
+# Run with:
+#   python scripts/perf/capture_asm.py \
+#     --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \
+#     --manifest agents/perf_capture_manifest.txt \
+#     --out-dir artifacts/perf/msvc_profile
+
+# --- heap sift_down_ in both graph backends ---
+sift_down_csr_idx2     0x300  use_indexed_dary_heap.2.   sift_down_   compressed_graph
+sift_down_csr_idx4     0x300  use_indexed_dary_heap.4.   sift_down_   compressed_graph
+sift_down_csr_idx8     0x300  use_indexed_dary_heap.8.   sift_down_   compressed_graph
+sift_down_vov_idx4     0x300  use_indexed_dary_heap.4.   sift_down_   dynamic_graph
+
+# --- heap sift_up_ for completeness (much smaller) ---
+sift_up_csr_idx4       0x180  use_indexed_dary_heap.4.   sift_up_     compressed_graph
+
+# --- BGL counterparts (for parity with future GCC results) ---
+# Boost's d_ary_heap_indirect uses preserve_heap_property_{down,up} methods.
+# Two instantiations each: [0] is for compressed_sparse_row (vertex id u32),
+# [1] is for adjacency_list (vertex id u64). We want the CSR one.
+bgl_dary_sift_down_csr:0   0x500  preserve_heap_property_down   d_ary_heap_indirect
+bgl_dary_sift_up_csr:0     0x300  preserve_heap_property_up     d_ary_heap_indirect
+
+# --- per-edge work suspects identified by VTune ---
+# (vtune ranked these 4-6 in Phase 2 hotspot table)
+# NOTE: incidence_view::iterator::operator* has no standalone body in this
+# build - MSVC inlined it everywhere; only @ILT thunks remain. The behaviour
+# is captured inside dijkstra_csr_idx{2,4,8} below.
+container_value_fn:0   0x100  container_value_fn         operator
+
+# --- the main Dijkstra body (CSR + Idx{2,4,8}) - the actual relax loop lives here ---
+# All three Dijkstra entries match the same regex; pick by --pick if needed.
+dijkstra_csr_idx2:0    0x320  ^graph::dijkstra_shortest_paths.*compressed_graph   use_indexed_dary_heap
+dijkstra_csr_idx4:1    0x320  ^graph::dijkstra_shortest_paths.*compressed_graph   use_indexed_dary_heap
+dijkstra_csr_idx8:2    0x320  ^graph::dijkstra_shortest_paths.*compressed_graph   use_indexed_dary_heap
+
+# --- BGL Dijkstra equivalent for direct comparison ---
+dijkstra_bgl_csr       0x800  dijkstra_shortest_paths_no_color_map_no_init        compressed_sparse_row
\ No newline at end of file
diff --git a/agents/dary_heap/perf_capture_manifest_linux.txt b/agents/dary_heap/perf_capture_manifest_linux.txt
new file mode 100644
index 0000000..0cb8a81
--- /dev/null
+++ b/agents/dary_heap/perf_capture_manifest_linux.txt
@@ -0,0 +1,33 @@
+# Linux/GCC capture manifest, parallels agents/perf_capture_manifest.txt.
+#
+# Differences from the MSVC manifest:
+#   - Demangled-name shape is the GCC/Itanium one, not MSVC.
+#   - GCC uses `Nul` (unsigned long) for non-type template args, not `Nu`.
+#   - GCC inlines the sift_down_/sift_up_ helpers AND BGL's
+#     preserve_heap_property_down/up entirely into the dijkstra body.
+#     There are no standalone bodies for those, so this manifest captures
+#     the enclosing dijkstra body instead — that is where the inlined
+#     instructions actually live, and that is what is directly comparable
+#     across toolchains.
+#   - graph-v3's dijkstra body lives inside an inner `{lambda(auto:1&)#1}::
+#     operator()` (the heap-using closure) — that closure is the body
+#     comparable to BGL's `dijkstra_shortest_paths_no_color_map_no_init`,
+#     which under GCC is fully inlined into `graph::benchmark::run_bgl_dijkstra`.
+#   - Sizes from `nm --print-size` are honoured; the length here is a fallback.
+#
+# Run via:
+#   python3 scripts/perf/objdump_capture.py \
+#     --exe build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra \
+#     --manifest agents/perf_capture_manifest_linux.txt \
+#     --out-dir artifacts/perf/linux_gcc
+
+# --- graph-v3 dijkstra bodies (heap is fully inlined into these) ---
+dijkstra_csr_idx2      0x800  use_indexed_dary_heap<2ul>   operator()  compressed_graph
+dijkstra_csr_idx4      0x800  use_indexed_dary_heap<4ul>   operator()  compressed_graph
+dijkstra_csr_idx8      0x800  use_indexed_dary_heap<8ul>   operator()  compressed_graph
+dijkstra_vov_idx4      0x800  use_indexed_dary_heap<4ul>   operator()  dynamic_graph
+
+# --- BGL dijkstra body (preserve_heap_property_{down,up} are inlined here) ---
+dijkstra_bgl_csr       0x900  run_bgl_dijkstra            compressed_sparse_row_graph
+dijkstra_bgl_adj       0x900  run_bgl_dijkstra            adjacency_list
+
diff --git a/agents/dary_heap/perf_linux_gcc_inventory.md b/agents/dary_heap/perf_linux_gcc_inventory.md
new file mode 100644
index 0000000..a1f788a
--- /dev/null
+++ b/agents/dary_heap/perf_linux_gcc_inventory.md
@@ -0,0 +1,123 @@
+# `artifacts/perf/linux_gcc/` — pre-collected Linux GCC reference
+
+Captured: 2026-04-28, branch `indexed-dary-heap`, host Titania (WSL2)
+Build: `linux-gcc-release` preset (g++ 13.x, `-O3 -DNDEBUG`,
+`DIJKSTRA_BENCH_BGL=ON`, BGL at `/home/phil/dev_graph/boost`)
+
+This directory is **gitignored** (it lives under `artifacts/`), but the
+file inventory below is committed so a future session — or a Windows
+session diff'ing against `msvc_profile/` — knows what artefacts to
+expect. Re-generate on Linux/WSL with:
+
+```bash
+cmake --preset linux-gcc-release \
+      -DDIJKSTRA_BENCH_BGL=ON \
+      -DBGL_INCLUDE_DIR=/home/phil/dev_graph/boost
+cmake --build --preset linux-gcc-release -j --target benchmark_dijkstra
+
+bash scripts/perf/linux_gcc_capture.sh
+```
+
+The script drives `scripts/perf/bench_run.py` (wall-clock), `perf stat`
+(software events only — WSL has no PMU), and
+`scripts/perf/objdump_capture.py` (per-symbol asm via the manifest at
+`agents/perf_capture_manifest_linux.txt`).
+
+## WSL-specific constraints
+
+- Hardware PMU events (`cache-misses`, `LLC-load-misses`,
+  `L1-dcache-load-misses`, `cycles` for `perf record`) silently fail
+  or return zero on WSL2. The capture script attempts a
+  software-only set (`task-clock,context-switches,page-faults,
+  cpu-migrations,instructions:u,cycles:u`); on this host even those
+  return non-zero from `perf stat`, so the `perfstat_*` files exist
+  but should not be relied on. Wall-clock + objdump are the primary
+  signals on this side.
+- Hardware-counter analysis (cache miss rates, frontend stalls,
+  branch mispredict ratios) was done on Windows under VTune; results
+  in `artifacts/perf/msvc_profile/{hotspots,callstacks}.csv`.
+
+## Inventory
+
+| File | Lines / size | Description |
+|------|-------------:|-------------|
+| `wallclock_baseline.json`        | 96 rows | bench_run.py JSON, 24 benches × 4 aggregates |
+| `diff_msvc_vs_gcc.md`            | 26      | Cross-toolchain markdown table from `bench_compare.py` |
+| `perfstat_*.{stdout,stderr}`     | 8 files | `perf stat` software events (PMU N/A; informational only) |
+| **graph-v3 algorithm bodies**    |         | (sift_down_/sift_up_/comparator are *fully inlined* — no separate symbols) |
+| `dijkstra_csr_idx2.asm`          | 361     | Outer dijkstra closure, Idx2 (heap inlined) |
+| `dijkstra_csr_idx4.asm`          | 387     | Outer dijkstra closure, Idx4 (heap inlined) |
+| `dijkstra_csr_idx8.asm`          | 382     | Outer dijkstra closure, Idx8 (heap inlined) |
+| `dijkstra_vov_idx4.asm`          | 465     | Outer dijkstra closure, VoV Idx4 (heap inlined) |
+| **BGL counterparts**             |         |             |
+| `dijkstra_bgl_csr.asm`           | 412     | run_bgl_dijkstra<csr_graph> (full BGL body inlined) |
+| `dijkstra_bgl_adj.asm`           | 424     | run_bgl_dijkstra<adjacency_list> (full BGL body inlined) |
+
+### Symbols that have no standalone body under GCC
+
+Under `-O3`, GCC inlines all of these into the dijkstra closure they're
+called from. The MSVC build does emit standalone bodies; the inlined
+GCC instructions are folded into the dijkstra body counts above.
+
+| Symbol (MSVC name)                              | MSVC body | GCC |
+|-------------------------------------------------|----------:|-----|
+| graph-v3 `sift_down_` (per arity)               | 184–186   | inlined |
+| graph-v3 `sift_up_`                             |       109 | inlined |
+| graph-v3 `container_value_fn::operator()`       |        85 | inlined |
+| BGL `preserve_heap_property_down`               |       299 | inlined |
+| BGL `preserve_heap_property_up`                 |       204 | inlined |
+
+The absence is itself a real codegen data point — see
+`agents/csr_edge_value_perf_plan.md` Phase 2 Linux GCC.
+
+## Headline observation
+
+Treating "fully inlined dijkstra body" as the unit of comparison
+(MSVC sum = dijkstra body + sift_down_ + sift_up_; GCC = the single
+emitted closure body):
+
+|                            | graph-v3 Idx4 | BGL CSR | ratio |
+|----------------------------|--------------:|--------:|------:|
+| MSVC `/O2 /Ob3` (sum)      |           499 |  1,008  | 2.0×  |
+| GCC `-O3` (closure body)   |           387 |    412  | 1.06× |
+
+graph-v3's closure compresses ~22 % under GCC; BGL's compresses ~59 %.
+That delta is consistent with the wall-clock observation that on Linux
+GCC graph-v3 CSR Idx4 is **+15 % to +40 % slower than BGL CSR**, while
+on MSVC it is 34–64 % *faster*.
+
+## Gap-status verdict (decision tree from `thread_b_linux_runbook.md`)
+
+> graph-v3 still +30 %+ slower on Grid (the original 4.3a worst case)
+
+The Phase 4.3a Linux/GCC gap is **intact at HEAD**:
+
+| Topology  | 2025 4.3a Idx4 vs BGL | 2026-04-28 Idx4 vs BGL |
+|-----------|-----------------------|------------------------|
+| ER_Sparse | +7.7 %                | +14.7 % – +21.9 %      |
+| Grid      | +36.5 %               | **+36.2 % – +39.9 %**  |
+| BA        | +9.4 %                | +6.0 % – +18.8 %       |
+| Path      | +15.0 %               | +15.2 % – +15.6 %      |
+
+Phases 3–5 of `csr_edge_value_perf_plan.md` are un-deferred.
+
+## Manifest reference
+
+Capture targets are listed in `agents/perf_capture_manifest_linux.txt`
+(format: `basename[:N]  length_hex  regex  [substr ...]`). Differences
+from the MSVC manifest:
+
+- GCC mangling uses `Nul` for `unsigned long` non-type template args
+  (e.g. `use_indexed_dary_heap<4ul>`), not MSVC's `4u`.
+- The graph-v3 dijkstra body lives inside an inner closure
+  `{lambda(auto:1&)#1}::operator()`; the manifest matches it via the
+  combination of `use_indexed_dary_heap<Nul>` + `operator()` +
+  graph-type substring (`compressed_graph` / `dynamic_graph`).
+- BGL's body is captured via `run_bgl_dijkstra` +
+  `compressed_sparse_row_graph` / `adjacency_list`. The
+  `dijkstra_shortest_paths_no_color_map_no_init` regex from the MSVC
+  manifest matches no symbol on GCC (fully inlined).
+- `.cold` partitions exist for several of these. `nm --print-size`
+  reports only the hot partition; the cold partition is at a lower
+  address (see `BM_Dijkstra_*_Idx4(...) [clone .cold]` symbols) and is
+  not currently captured.
diff --git a/agents/dary_heap/perf_msvc_profile_inventory.md b/agents/dary_heap/perf_msvc_profile_inventory.md
new file mode 100644
index 0000000..e7fbf7d
--- /dev/null
+++ b/agents/dary_heap/perf_msvc_profile_inventory.md
@@ -0,0 +1,71 @@
+# `artifacts/perf/msvc_profile/` — pre-collected MSVC reference
+
+Captured: 2026-04-27, branch `indexed-dary-heap`, host Titania
+Build: `windows-msvc-profile` preset (`/O2 /Ob3 /Zi /DNDEBUG`, `/DEBUG`
+linker, `DIJKSTRA_BENCH_BGL=ON`, BGL at `D:/dev_graph/boost`)
+
+This directory is **gitignored** (it lives under `artifacts/`), but the
+file inventory below is committed so a Linux/GCC session knows what
+artefacts to compare against. Re-generate on Windows with:
+
+```pwsh
+# Wall-clock baseline (2-3 min)
+python scripts/perf/bench_run.py `
+  --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe `
+  --filter "BM_Dijkstra_(CSR|BGL_CSR)_(ER_Sparse|Grid|BA|Path)(_Idx4)?/(10000|100000)$" `
+  --reps 5 --min-time 2s --label "windows-msvc-profile" `
+  --out artifacts/perf/msvc_profile/wallclock_baseline.json
+
+# Symbol disasm captures (~1 min after symbol-index cold call)
+python scripts/perf/capture_asm.py `
+  --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe `
+  --manifest agents/perf_capture_manifest.txt `
+  --out-dir artifacts/perf/msvc_profile
+
+# VTune hotspots + callstacks (~30s collect + 5s report)
+& $vtune -collect hotspots -knob sampling-mode=sw `
+  -result-dir vtune/hot_grid_idx4_profile_001 -- `
+  build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe `
+  --benchmark_filter="BM_Dijkstra_CSR_Grid_Idx4/100000" --benchmark_min_time=15s
+& $vtune -report hotspots   -r vtune/hot_grid_idx4_profile_001 -format csv `
+  > artifacts/perf/msvc_profile/hotspots.csv
+& $vtune -report callstacks -r vtune/hot_grid_idx4_profile_001 -format csv `
+  > artifacts/perf/msvc_profile/callstacks.csv
+```
+
+## Inventory
+
+| File | Lines / size | Description |
+|------|-------------:|-------------|
+| `wallclock_baseline.json`         | 96 rows | bench_run.py JSON for 24 benchmarks × 4 aggregates |
+| `hotspots.csv`                    | 38 KB   | VTune software-mode hotspots, function-level CPU time |
+| `callstacks.csv`                  | 597 KB  | VTune callstack tree |
+| **graph-v3 heap**                 |         |             |
+| `sift_down_csr_idx2.asm`          | 186     | Idx2 heap sift-down body |
+| `sift_down_csr_idx4.asm`          | 184     | Idx4 heap sift-down body |
+| `sift_down_csr_idx8.asm`          | 186     | Idx8 heap sift-down body |
+| `sift_down_vov_idx4.asm`          | 191     | VoV Idx4 sift-down (control) |
+| `sift_up_csr_idx4.asm`            | 109     | Idx4 sift-up body |
+| **graph-v3 algorithm**            |         |             |
+| `dijkstra_csr_idx2.asm`           | 206     | Outer Dijkstra body, Idx2 |
+| `dijkstra_csr_idx4.asm`           | 206     | Outer Dijkstra body, Idx4 |
+| `dijkstra_csr_idx8.asm`           | 206     | Outer Dijkstra body, Idx8 |
+| `container_value_fn.asm`          | 85      | edge_value adapter |
+| **BGL counterparts**              |         |             |
+| `bgl_dary_sift_down_csr.asm`      | 299     | preserve_heap_property_down (CSR) |
+| `bgl_dary_sift_up_csr.asm`        | 204     | preserve_heap_property_up (CSR) |
+| `dijkstra_bgl_csr.asm`            | 505     | dijkstra_shortest_paths_no_color_map_no_init (CSR) |
+
+## Headline observation
+
+Line counts are a proxy for instruction count when comparing functions
+compiled at the same `/O2` level on the same toolchain. On MSVC:
+
+|                       | graph-v3 | BGL | ratio |
+|-----------------------|---------:|----:|------:|
+| Dijkstra body         |      206 | 505 |  2.5× |
+| sift_down_            |      184 | 299 |  1.6× |
+
+This is consistent with graph-v3 being **34-64 % faster than BGL** on
+MSVC profile (Phase 1.1 wall-clock data). The Linux side needs to confirm
+or refute the same ratio under GCC.
diff --git a/agents/dary_heap/thread_b_linux_runbook.md b/agents/dary_heap/thread_b_linux_runbook.md
new file mode 100644
index 0000000..4669eb5
--- /dev/null
+++ b/agents/dary_heap/thread_b_linux_runbook.md
@@ -0,0 +1,144 @@
+# Thread B — Linux/GCC investigation runbook
+
+This runbook closes Thread B of `agents/csr_edge_value_perf_plan.md`.
+
+The MSVC side (Phases 1.1, 2) is complete and committed. The original
+Phase 4.3a "graph-v3 is +7 % to +37 % slower than BGL" gap was measured on
+**Linux/GCC** and has not been reproduced under MSVC `/O2 /Ob3`. We need
+to know whether the gap still exists on Linux/GCC at the current branch
+HEAD before deciding whether Phases 3-5 (interventions) should run.
+
+## Constraint: WSL has no PMU
+
+WSL2 does not expose the host's hardware performance counters, so:
+
+- ❌ No `perf stat -e cache-misses,L1-dcache-load-misses,LLC-load-misses`
+- ❌ No `perf record -e cycles -F 4000`
+- ❌ No Linux equivalent of VTune microarchitecture exploration
+- ✅ Wall-clock benchmarks work fine
+- ✅ `perf stat` for software events (`task-clock`, `instructions:u`,
+     `context-switches`, etc.) — covered by the helper script
+- ✅ `objdump --demangle` — full disassembly comparison against MSVC
+
+Everything that *does* need PMU has been pre-collected on the Windows
+side and lives in `artifacts/perf/msvc_profile/`. The Linux side compares
+against it directly.
+
+## Pre-collected MSVC reference (`artifacts/perf/msvc_profile/`)
+
+| File | What it gives the Linux comparison |
+|---|---|
+| `wallclock_baseline.json` | 96 rows (24 benchmarks × 4 aggregates) on `windows-msvc-profile`. Run the same filter on Linux and use `bench_compare.py` to diff. |
+| `hotspots.csv` | VTune software-mode hotspots top-N (function-level CPU time). Linux reproduces this with `perf record --call-graph=fp -F 999` (no PMU needed). |
+| `callstacks.csv` | VTune callstack tree. Linux gets the same shape from `perf script` after a software-event `perf record`. |
+| `sift_down_csr_idx{2,4,8}.asm` | Per-arity inlined heap-sift body. Each is ~190 lines, 5-insn-per-comparison shape. |
+| `sift_down_vov_idx4.asm` | VoV variant for control. |
+| `sift_up_csr_idx4.asm` | sift_up_ counterpart (~110 lines). |
+| `dijkstra_csr_idx{2,4,8}.asm` | The actual Dijkstra-with-relax-loop body, ~206 lines each. |
+| `dijkstra_bgl_csr.asm` | BGL's `dijkstra_shortest_paths_no_color_map_no_init` for `compressed_sparse_row_graph`. **505 lines on MSVC.** Compare line counts and per-edge instruction count vs graph-v3's 206. |
+| `bgl_dary_sift_down_csr.asm` | BGL's `preserve_heap_property_down`, ~299 lines. Compare against graph-v3's `sift_down_csr_idx4.asm` (184 lines). |
+| `bgl_dary_sift_up_csr.asm` | BGL's `preserve_heap_property_up`, ~204 lines. |
+| `container_value_fn.asm` | graph-v3's value-function adapter. |
+
+### The size signal
+
+MSVC line counts at `/O2 /Ob3 /Zi`:
+
+```
+graph-v3 dijkstra body (Idx4)           206 lines
+BGL      dijkstra body (CSR)            505 lines    (~2.5x)
+
+graph-v3 sift_down_ (Idx4)              184 lines
+BGL      preserve_heap_property_down    299 lines    (~1.6x)
+```
+
+This is consistent with graph-v3 being 34-64 % faster than BGL on MSVC
+(measured in Phase 1.1). The Linux question is whether GCC produces a
+similar size ratio (in which case Linux/GCC should also see graph-v3
+ahead) or whether GCC compresses BGL more aggressively (which would
+explain the original 4.3a gap).
+
+## Setup (WSL)
+
+```bash
+# 1. Configure & build the Linux release preset
+cmake --preset linux-gcc-release \
+      -DDIJKSTRA_BENCH_BGL=ON \
+      -DBGL_INCLUDE_DIR=/path/to/boost
+cmake --build --preset linux-gcc-release -j
+
+# 2. Verify the benchmark binary exists
+file build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra
+```
+
+If `linux-gcc-release` doesn't exist as a preset on this branch, either
+add one mirroring `windows-msvc-release` or build manually:
+
+```bash
+mkdir -p build/linux-gcc-release && cd build/linux-gcc-release
+cmake -G Ninja -DCMAKE_BUILD_TYPE=Release \
+      -DBUILD_BENCHMARKS=ON \
+      -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/path/to/boost \
+      ../..
+ninja benchmark_dijkstra
+```
+
+## Capture
+
+```bash
+# Drives steps 1, 2, 3 below in order.
+bash scripts/perf/linux_gcc_capture.sh
+```
+
+The script does:
+
+1. **Wall-clock baseline** (`bench_run.py`, 5 reps median, taskset core 4).
+   Output: `artifacts/perf/linux_gcc/wallclock_baseline.json`.
+2. **`perf stat` software events** for the 4 canonical 100K benchmarks.
+   Output: `artifacts/perf/linux_gcc/perfstat_*.{stdout,stderr}`.
+3. **`objdump` per-symbol captures** mirroring the MSVC manifest.
+   Output: `artifacts/perf/linux_gcc/*.asm`.
+
+## Compare
+
+```bash
+# Toolchain wall-clock comparison
+python3 scripts/perf/bench_compare.py \
+  --baseline artifacts/perf/msvc_profile/wallclock_baseline.json \
+  --candidate artifacts/perf/linux_gcc/wallclock_baseline.json \
+  --label-baseline msvc --label-candidate gcc \
+  --threshold 5
+
+# Per-symbol size diff (one-liner)
+for f in artifacts/perf/msvc_profile/*.asm; do
+  base=$(basename "$f" .asm)
+  ml=$(wc -l < "$f")
+  gl=$(wc -l < "artifacts/perf/linux_gcc/$base.asm" 2>/dev/null || echo NA)
+  printf "%-30s  msvc=%4s  gcc=%4s\n" "$base" "$ml" "$gl"
+done
+```
+
+## Decision tree
+
+| Outcome | Verdict |
+|---------|---------|
+| graph-v3 wins on Linux too (≤ 0 % delta vs BGL) | Plan **closed**. Update `csr_edge_value_perf_plan.md` Phase 5. The Phase 4.3a gap was closed by the post-4.3a commits (5085c60, 7645a19, 1c871a8, aa95fe0). |
+| graph-v3 still slower than BGL on Linux (+5 % or more) | Original investigation **resumes**: run Phase 1.2 (perf-stat counters — software-only on WSL), Phase 1.3 (perf record), Phase 2 (objdump diff between graph-v3 and BGL). The pre-collected MSVC asm gives the codegen reference for what "tight" looks like on the comparable workload. |
+| Linux numbers are noisy (CV > 10 %) | Re-run with `--reps 9` and a quieter system. WSL on a busy Windows host is the most likely cause; pin to a single core (`taskset -c 0`) and disable Windows background services. |
+
+## Files added by this work
+
+| Path | Purpose |
+|------|---------|
+| `scripts/perf/sym_index.py`            | Cached dumpbin symbol index (Windows) |
+| `scripts/perf/disasm_func.py`          | Single-function disasm (Windows) |
+| `scripts/perf/find_func.py`            | Symbol search (Windows) |
+| `scripts/perf/capture_asm.py`          | Bulk dumpbin capture (Windows) |
+| `scripts/perf/objdump_capture.py`      | Bulk objdump capture (Linux) |
+| `scripts/perf/linux_gcc_capture.sh`    | Linux runbook driver |
+| `scripts/perf/bench_run.py`            | Cross-platform Google Benchmark wrapper |
+| `scripts/perf/bench_compare.py`        | JSON-diff into markdown |
+| `scripts/perf/vtune_top.py`            | VTune CSV parser |
+| `agents/perf_capture_manifest.txt`     | MSVC capture targets |
+| `agents/perf_capture_manifest_linux.txt` | GCC capture targets |
+| `artifacts/perf/msvc_profile/`         | Pre-collected MSVC reference (gitignored) |
diff --git a/agents/indexed_dary_heap_plan.md b/agents/indexed_dary_heap_plan.md
deleted file mode 100644
index 53a64d3..0000000
--- a/agents/indexed_dary_heap_plan.md
+++ /dev/null
@@ -1,319 +0,0 @@
-# Indexed d-ary Heap for Dijkstra & Prim — Plan
-
-This plan introduces a true decrease-key priority queue to replace the
-`std::priority_queue` lazy-deletion pattern currently used by Dijkstra
-(and likely useful for Prim's MST). The goal is to remove stale-pop
-overhead, reduce heap memory from O(E) to O(V), and bring visitor
-semantics in line with BGL.
-
-**Branch:** `indexed-dary-heap`
-
-**Invariant:** After every phase, `ctest` passes all existing tests. No
-phase may break the public API of `dijkstra_shortest_paths`,
-`dijkstra_shortest_distances`, or any algorithm that already uses
-`std::priority_queue` internally.
-
----
-
-## Conventions
-
-| Symbol | Meaning |
-|--------|---------|
-| **File** | Absolute path relative to repo root |
-| **Read** | Files the agent must read for context before editing |
-| **Create** | New files to create |
-| **Modify** | Existing files to edit |
-| **Verify** | Commands to run and expected outcomes |
-| **Commit** | Git commit message (conventional-commit style) |
-
----
-
-## Background
-
-### Current state
-
-`dijkstra_shortest_paths` uses `std::priority_queue<weighted_vertex>`
-with re-insertion when a vertex's distance improves. The recently
-added stale-pop skip:
-
-```cpp
-if (compare(distance(g, uid), w)) continue;
-```
-
-makes this correct and gives single-shot visitor semantics, but the
-heap can hold up to O(E) entries and every relaxed edge causes a
-push.
-
-### Target state
-
-A min-heap that:
-
-- Stores at most one entry per vertex (size ≤ V).
-- Supports `push`, `top`, `pop`, `decrease(vid)`, `contains(vid)`.
-- Looks up a vertex's current distance via the user-supplied
-  `DistanceFn` (so heap order tracks live distance).
-- Is parameterized on arity `d` (default `d = 4`, matching Boost's
-  `d_ary_heap_indirect`).
-- Uses an external position map (`vertex_id -> heap_index`) so that
-  `decrease` is O(log_d V).
-
-### Performance hypothesis
-
-| Workload | Expected change vs. current |
-|----------|-----------------------------|
-| Sparse graph, few re-relaxations | Small win (push count drops, log V vs log E) |
-| Dense graph, many re-relaxations | Large win (heap size O(V) vs O(E)) |
-| Mapped (associative) vertex containers | Win depends on position-map cost |
-
-Hypothesis must be confirmed by benchmarks (Phase 4) before declaring
-the new heap the default.
-
----
-
-## Phase 0 — Preparation (no code changes)
-
-### 0.1 Verify Baseline
-
-| Item | Detail |
-|------|--------|
-| **Action** | Confirm the full test suite is green on the branch base. |
-| **Verify** | `cd build/linux-gcc-debug && ctest --output-on-failure` — all tests pass |
-
-### 0.2 Capture Baseline Benchmarks
-
-| Item | Detail |
-|------|--------|
-| **Action** | Record current Dijkstra benchmark numbers. |
-| **Read** | `benchmark/algorithms/` for existing Dijkstra benchmarks |
-| **Verify** | Save numbers to `agents/indexed_dary_heap_baseline.md` (gitignored or committed as reference). If no Dijkstra benchmark exists, create one in 0.3. |
-
-### 0.3 Add Dijkstra Benchmark (if missing)
-
-| Item | Detail |
-|------|--------|
-| **Action** | Ensure a Google Benchmark target exercises Dijkstra over (a) sparse random graph, (b) dense random graph, (c) grid graph, each at multiple V sizes. |
-| **Create** | `benchmark/algorithms/benchmark_dijkstra.cpp` if not present |
-| **Verify** | Benchmark builds and produces stable numbers across runs (CV < 5%). |
-
----
-
-## Phase 1 — Indexed d-ary Heap Container
-
-### 1.1 Design Header
-
-| Item | Detail |
-|------|--------|
-| **Read** | `boost/libs/graph/include/boost/graph/detail/d_ary_heap.hpp` for reference |
-| **Create** | `include/graph/detail/indexed_dary_heap.hpp` |
-
-Sketch of the public interface:
-
-```cpp
-namespace graph::detail {
-
-// External-key, indirect-comparison d-ary heap.
-//
-// Key      : the user's vertex id type (must be usable as an index/lookup key)
-// DistanceFn: callable (key) -> Distance&  (or const Distance&)
-// Compare  : strict weak order over Distance values (min-heap if less<>)
-// PositionMap: random-access mapping key -> size_t (heap position) or NPOS
-// Arity    : children per node (default 4)
-template <
-    class Key,
-    class DistanceFn,
-    class Compare,
-    class PositionMap,
-    std::size_t Arity = 4,
-    class Allocator = std::allocator<Key>>
-class indexed_dary_heap {
-public:
-    static constexpr std::size_t npos = static_cast<std::size_t>(-1);
-
-    indexed_dary_heap(DistanceFn d, Compare c, PositionMap p, const Allocator& = {});
-
-    bool   empty() const noexcept;
-    size_t size()  const noexcept;
-
-    void   push(Key k);              // O(log_d N)
-    Key    top()   const;            // O(1)
-    void   pop();                    // O(d log_d N)
-    void   decrease(Key k);          // O(log_d N) — distance must already be lower
-    bool   contains(Key k) const;    // O(1)
-    void   clear();
-
-private:
-    std::vector<Key, Allocator> heap_;
-    DistanceFn   distance_;
-    Compare      compare_;
-    PositionMap  position_;          // heap stores positions back into here on every move
-
-    void sift_up_(size_t i);
-    void sift_down_(size_t i);
-    void place_(size_t i, Key k);    // writes heap_[i] = k AND position_[k] = i
-};
-
-} // namespace graph::detail
-```
-
-Notes:
-- `PositionMap` is a *concept-style* requirement: `size_t& operator()(Key)` or
-  similar. For index-based graphs, it can wrap a `std::vector<size_t>`. For
-  mapped graphs, it can wrap an `std::unordered_map`. Decision deferred to
-  1.3.
-- `DistanceFn` is the *same* function the user passes to
-  `dijkstra_shortest_paths`. The heap reads, never writes.
-- Comparator is `Compare`, applied to *distances* (not keys). Internally:
-  `compare_(distance_(a), distance_(b))`.
-
-### 1.2 Implement Core Operations
-
-| Item | Detail |
-|------|--------|
-| **Action** | Implement `push`, `pop`, `sift_up_`, `sift_down_`, `decrease`, `contains`, `clear`. Keep `place_` as the single point where positions are written, to avoid bookkeeping bugs. |
-| **Verify** | Unit-tests in 1.4 pass. |
-
-Key correctness rules:
-- Every assignment to `heap_[i]` must go through `place_` so `position_` stays in sync.
-- `decrease(k)` reads `position_(k)` then sifts up only — caller guarantees the new distance is no worse.
-- `pop()` swaps last → root, marks the popped key's position as `npos`, then sifts down.
-
-### 1.3 Position Map Adapter
-
-| Item | Detail |
-|------|--------|
-| **Create** | `include/graph/detail/heap_position_map.hpp` |
-| **Action** | Provide two adapters: <br> 1. `vector_position_map` — wraps a `std::vector<size_t>` indexed by integral key. <br> 2. `assoc_position_map` — wraps `std::unordered_map<Key, size_t>` for non-integral keys. <br> Both default-construct to `npos` semantics. |
-| **Verify** | Adapters compile with the heap. Covered by tests in 1.4. |
-
-### 1.4 Unit Tests
-
-| Item | Detail |
-|------|--------|
-| **Create** | `tests/common/test_indexed_dary_heap.cpp` |
-| **Action** | Cover: empty heap, single element, ascending/descending pushes, mixed push+pop, repeated `decrease`, `contains` before/after push/pop, both arity 2 and 4, custom comparator (max-heap), both position-map adapters. |
-| **Verify** | `ctest -R indexed_dary_heap` — all pass. |
-| **Commit** | `feat(detail): indexed d-ary heap with external position map` |
-
----
-
-## Phase 2 — Integrate into Dijkstra (opt-in)
-
-### 2.1 Add Heap-Selector Tag (or Template Parameter)
-
-| Item | Detail |
-|------|--------|
-| **Read** | `include/graph/algorithm/dijkstra_shortest_paths.hpp` |
-| **Modify** | Add an optional template parameter `Heap = use_default_heap` (a tag). When `use_default_heap`, behavior is unchanged. When `use_indexed_dary_heap<Arity>`, the new heap is used. |
-| **Verify** | Existing tests still pass (default path unchanged). |
-| **Commit** | `feat(dijkstra): add heap-selector template parameter` |
-
-Rationale: keeps the change additive and reversible. We can flip the
-default in a later phase once benchmarks confirm parity or improvement.
-
-### 2.2 Implementation Branch
-
-| Item | Detail |
-|------|--------|
-| **Modify** | Inside `dijkstra_shortest_paths`, dispatch to one of two inner implementations based on the `Heap` tag. Share the visitor / relax / source-seeding code via a small helper. |
-| **Action** | The indexed-heap implementation: <br> - Removes the stale-pop skip (no stale entries possible). <br> - Replaces re-push with `decrease` on the relax path. <br> - Removes `weighted_vertex` (heap stores ids only; distance is read live via `DistanceFn`). |
-| **Verify** | All existing Dijkstra tests pass under both code paths. Add a test variant that exercises each test with the indexed heap. |
-| **Commit** | `feat(dijkstra): indexed d-ary heap implementation path` |
-
-### 2.3 Visitor Semantics Audit
-
-| Item | Detail |
-|------|--------|
-| **Action** | Confirm `on_examine_vertex` and `on_finish_vertex` fire exactly once per reachable vertex on the indexed-heap path. Confirm `on_edge_relaxed` and `on_edge_not_relaxed` counts match Boost's behavior. |
-| **Verify** | Add a counting-visitor test that asserts call counts on a reference graph with both heap paths. |
-| **Commit** | `test(dijkstra): visitor call-count parity across heap paths` |
-
----
-
-## Phase 3 — Mapped-Container Support
-
-### 3.1 Position Map for Mapped Graphs
-
-| Item | Detail |
-|------|--------|
-| **Read** | `agents/map_container_strategy.md`, `agents/map_container_plan.md` |
-| **Action** | Wire the `assoc_position_map` adapter into the indexed-heap dispatch when `vertex_id_t<G>` is non-integral or the graph is a mapped container. Decision criterion to be documented. |
-| **Verify** | Run the Dijkstra test suite against mapped graph types with the indexed heap. |
-| **Commit** | `feat(dijkstra): indexed-heap support for mapped containers` |
-
-### 3.2 Vertex-Property-Map Position Storage (optional)
-
-| Item | Detail |
-|------|--------|
-| **Action** | Investigate whether the position map can live inside the graph as a vertex property map (matching Boost's `vertex_property_map_generator`). Spike only — implement only if it removes a meaningful allocation on hot paths. |
-| **Verify** | Benchmark before/after on mapped graphs. |
-| **Commit** | `feat(dijkstra): in-graph position map for mapped containers` (only if accepted) |
-
----
-
-## Phase 4 — Benchmarks & Default Selection
-
-### 4.1 Comparative Benchmarks
-
-| Item | Detail |
-|------|--------|
-| **Action** | Run the Phase 0.3 benchmarks against (a) `priority_queue` path, (b) `indexed_dary_heap<2>`, (c) `indexed_dary_heap<4>`, (d) `indexed_dary_heap<8>`. Record results in `agents/indexed_dary_heap_results.md`. |
-| **Verify** | Numbers stable across at least 3 runs. |
-
-### 4.2 Decide Default
-
-| Item | Detail |
-|------|--------|
-| **Action** | Based on results: <br> - If indexed `d=4` wins or ties on every workload, make it the default. <br> - If it loses on sparse small graphs, keep `priority_queue` default and document the selector. <br> - If results are mixed, consider a heuristic dispatch (e.g., based on E/V ratio) — but only with strong evidence. |
-| **Modify** | Default heap parameter, plus a CHANGELOG entry. |
-| **Verify** | Full test suite still green. Benchmarks regenerated. |
-| **Commit** | `perf(dijkstra): switch default heap to indexed d-ary` (or document why not) |
-
----
-
-## Phase 5 — Reuse for Prim's MST (optional follow-up)
-
-### 5.1 Audit Prim's Implementation
-
-| Item | Detail |
-|------|--------|
-| **Read** | `include/graph/algorithm/mst.hpp` (or wherever Prim lives) |
-| **Action** | Identify whether Prim has the same lazy-deletion pattern. If yes, plan a parallel migration. |
-| **Verify** | N/A (planning only). |
-
-### 5.2 Apply Indexed Heap to Prim
-
-| Item | Detail |
-|------|--------|
-| **Action** | Mirror Phase 2 for Prim: opt-in selector → integrate → benchmark → switch default. |
-| **Verify** | MST test suite green. |
-| **Commit** | `perf(mst): indexed d-ary heap path for Prim` |
-
----
-
-## Open Questions
-
-1. **PositionMap ownership.** Owned by the heap (simplest, allocates per call), or
-   passed in (zero-allocation for repeated calls, more API surface)? Default to
-   owned-by-heap for the first cut.
-2. **Arity as runtime vs compile-time.** Compile-time only — runtime would lose
-   the constexpr unrolling that justifies d-ary heaps in the first place.
-3. **`Compare` indirection cost.** The heap calls `compare_(distance_(a), distance_(b))`
-   twice per sift-down step (one comparator call per child + one against the parent).
-   For trivial `DistanceFn` (vector lookup) this should inline; verify in benchmarks.
-4. **Visitor `on_examine_vertex` semantics on multi-source seeding.** The current
-   multi-source code seeds N vertices into the queue. With the indexed heap, the
-   first pop of each source is the settled pop (no re-pushes possible since
-   distance is already 0). Confirm visitor semantics are unchanged.
-5. **Should the new heap live in `graph/detail/` or be promoted to `graph/container/`?**
-   Defer the decision — start in `detail/` and promote only if external code finds it
-   useful.
-
----
-
-## Out of Scope
-
-- Fibonacci heap, pairing heap, or radix heap implementations.
-- Replacing other algorithms' priority queues (BFS variants, A*, etc.).
-- Changing public algorithm signatures beyond adding the optional `Heap` template
-  parameter.
-- Parallel / concurrent heap variants.
diff --git a/benchmark/algorithms/CMakeLists.txt b/benchmark/algorithms/CMakeLists.txt
index 0ed3c32..8883b0f 100644
--- a/benchmark/algorithms/CMakeLists.txt
+++ b/benchmark/algorithms/CMakeLists.txt
@@ -1,20 +1,111 @@
 # Algorithm Benchmarks CMakeLists.txt
 # Performance benchmarks for graph algorithms
 
-# Benchmark executables will be added here as algorithms are implemented
-# Example:
+# ---------------------------------------------------------------------------
+# Dijkstra benchmark (Phase 0 — baseline capture)
+# ---------------------------------------------------------------------------
+
+add_executable(benchmark_dijkstra
+    benchmark_dijkstra.cpp
+)
+
+target_link_libraries(benchmark_dijkstra
+    PRIVATE
+        graph::graph3
+        benchmark::benchmark
+)
+
+target_include_directories(benchmark_dijkstra
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# Optional large-scale tier (V = 1 000 000): cmake -DDIJKSTRA_BENCH_LARGE=ON
+option(DIJKSTRA_BENCH_LARGE "Enable V=1000000 benchmark tier" OFF)
+if(DIJKSTRA_BENCH_LARGE)
+    target_compile_definitions(benchmark_dijkstra PRIVATE DIJKSTRA_BENCH_LARGE)
+endif()
+
+# Optional Boost.Graph (BGL) comparison benchmarks (Phase 4.3).
 #
-# add_executable(benchmark_shortest_path
-#     benchmark_shortest_path.cpp
-# )
+# BGL is header-only, so we only need an include directory containing
+# <boost/graph/dijkstra_shortest_paths.hpp>. No linking required.
 #
-# target_link_libraries(benchmark_shortest_path
-#     PRIVATE
-#         graph::graph3
-#         benchmark::benchmark
-# )
+# Resolution order for the include directory (first match wins):
+#   1. Explicit cache var:        cmake -DBGL_INCLUDE_DIR=/path/to/boost
+#   2. Environment variable:      $env:BGL_INCLUDE_DIR or $BGL_INCLUDE_DIR
+#   3. Environment variable:      $env:BOOST_ROOT     or $BOOST_ROOT
+#   4. Per-platform defaults (see DIJKSTRA_BENCH_BGL_DEFAULT_PATHS below)
 #
-# add_test(NAME benchmark_shortest_path
-#     COMMAND benchmark_shortest_path --benchmark_min_time=0.1)
+# Typical invocations:
+#   cmake -DDIJKSTRA_BENCH_BGL=ON                                  # auto-discover
+#   cmake -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=D:/dev_graph/boost
+#   $env:BGL_INCLUDE_DIR = "D:/dev_graph/boost"; cmake -DDIJKSTRA_BENCH_BGL=ON
+option(DIJKSTRA_BENCH_BGL "Enable BGL comparison benchmarks (auto-discovers Boost; override with BGL_INCLUDE_DIR)" OFF)
+set(BGL_INCLUDE_DIR "" CACHE PATH "Boost include directory for BGL benchmarks (overrides auto-discovery)")
+
+# Per-platform default search paths. Customise for new environments by either
+# setting BGL_INCLUDE_DIR / BOOST_ROOT in the environment or by appending here.
+if(WIN32)
+    set(DIJKSTRA_BENCH_BGL_DEFAULT_PATHS
+        "D:/dev_graph/boost"
+        "C:/dev_graph/boost"
+        "C:/boost"
+        "C:/local/boost"
+    )
+elseif(APPLE)
+    set(DIJKSTRA_BENCH_BGL_DEFAULT_PATHS
+        "$ENV{HOME}/dev_graph/boost"
+        "/opt/homebrew/include"
+        "/usr/local/include"
+    )
+else() # Linux / other Unix
+    set(DIJKSTRA_BENCH_BGL_DEFAULT_PATHS
+        "$ENV{HOME}/dev_graph/boost"
+        "/usr/include"
+        "/usr/local/include"
+    )
+endif()
+
+if(DIJKSTRA_BENCH_BGL)
+    set(_bgl_resolved "")
+    set(_bgl_marker  "boost/graph/dijkstra_shortest_paths.hpp")
+
+    # Build candidate list in priority order.
+    set(_bgl_candidates "")
+    if(BGL_INCLUDE_DIR)
+        list(APPEND _bgl_candidates "${BGL_INCLUDE_DIR}")
+    endif()
+    if(DEFINED ENV{BGL_INCLUDE_DIR} AND NOT "$ENV{BGL_INCLUDE_DIR}" STREQUAL "")
+        list(APPEND _bgl_candidates "$ENV{BGL_INCLUDE_DIR}")
+    endif()
+    if(DEFINED ENV{BOOST_ROOT} AND NOT "$ENV{BOOST_ROOT}" STREQUAL "")
+        list(APPEND _bgl_candidates "$ENV{BOOST_ROOT}")
+    endif()
+    list(APPEND _bgl_candidates ${DIJKSTRA_BENCH_BGL_DEFAULT_PATHS})
+
+    foreach(_cand IN LISTS _bgl_candidates)
+        if(_cand AND EXISTS "${_cand}/${_bgl_marker}")
+            set(_bgl_resolved "${_cand}")
+            break()
+        endif()
+    endforeach()
+
+    if(NOT _bgl_resolved)
+        message(FATAL_ERROR
+            "DIJKSTRA_BENCH_BGL=ON but no Boost include directory was found containing "
+            "${_bgl_marker}.\n"
+            "Tried (in order): ${_bgl_candidates}\n"
+            "Set -DBGL_INCLUDE_DIR=/path/to/boost, $env:BGL_INCLUDE_DIR, or $env:BOOST_ROOT, "
+            "or append to DIJKSTRA_BENCH_BGL_DEFAULT_PATHS in benchmark/algorithms/CMakeLists.txt.")
+    endif()
+
+    target_include_directories(benchmark_dijkstra SYSTEM PRIVATE "${_bgl_resolved}")
+    target_compile_definitions(benchmark_dijkstra PRIVATE BENCH_BGL)
+    message(STATUS "BGL comparison benchmarks enabled (using ${_bgl_resolved})")
+endif()
 
-# Note: Uncomment and add benchmark executables as algorithms are implemented
+# Register with CTest using a short minimum time so CI stays fast.
+# For proper baseline capture use: ./benchmark_dijkstra --benchmark_min_time=1.0
+add_test(NAME benchmark_dijkstra
+    COMMAND benchmark_dijkstra --benchmark_min_time=0.1)
diff --git a/benchmark/algorithms/benchmark_dijkstra.cpp b/benchmark/algorithms/benchmark_dijkstra.cpp
new file mode 100644
index 0000000..c44a7f5
--- /dev/null
+++ b/benchmark/algorithms/benchmark_dijkstra.cpp
@@ -0,0 +1,346 @@
+/**
+ * @file benchmark_dijkstra.cpp
+ * @brief Google Benchmark suite for dijkstra_shortest_distances.
+ *
+ * Covers four graph topologies × two containers (CSR and vov) across a
+ * scale sweep of V ∈ {1 000, 10 000, 100 000}.  Graph construction and
+ * distance-vector initialisation are excluded from the timed region via
+ * state.PauseTiming() / state.ResumeTiming() — only the Dijkstra call
+ * itself is measured.
+ *
+ * Benchmark naming convention:
+ *   BM_Dijkstra_<Container>_<Topology>           — default heap (priority_queue)
+ *   BM_Dijkstra_<Container>_<Topology>_Idx<D>    — indexed d-ary heap, arity D
+ *   Container : CSR  (compressed_graph)
+ *               VoV  (dynamic_graph / vov)
+ *   Topology  : ER_Sparse   Erdős–Rényi, E/V ≈ 8
+ *               Grid        2D grid (bidirectional, E/V ≈ 4)
+ *               BA          Barabási–Albert, m=4, E/V ≈ 8
+ *               Path        Path graph, E/V = 1 (minimum decrease-key)
+ *
+ * Compile-time macro DIJKSTRA_BENCH_LARGE enables the 1 000 000-vertex
+ * tier (disabled by default to keep CI times reasonable).
+ *
+ * Phase 0.4 baseline results: agents/indexed_dary_heap_baseline.md
+ * Phase 4.1 comparative results: agents/indexed_dary_heap_results.md
+ */
+
+#include <benchmark/benchmark.h>
+
+#include <graph/algorithm/dijkstra_shortest_paths.hpp>
+#include <graph/algorithm/traversal_common.hpp>
+#include <graph/graph.hpp>
+
+#include "dijkstra_fixtures.hpp"
+
+#ifdef BENCH_BGL
+#  include "bgl_dijkstra_fixtures.hpp"
+#endif
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <limits>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// Internal helpers
+// ---------------------------------------------------------------------------
+
+namespace {
+
+/// Resize and initialise the distance vector to +∞.  Called once before
+/// the benchmark loop; the per-iteration reset uses std::fill (cheaper
+/// than reallocation) inside the paused region.
+template <class G>
+void init_dist(const G& g, std::vector<double>& dist) {
+  const std::size_t n = graph::num_vertices(g);
+  dist.assign(n, std::numeric_limits<double>::max());
+}
+
+/// Weight function: return the edge value stored in the graph.
+constexpr auto weight_fn = [](const auto& g, const auto& uv) {
+  return graph::edge_value(g, uv);
+};
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Macro: define a Dijkstra benchmark for a given container, graph builder,
+//        and heap tag.
+//
+// Parameters:
+//   NAME       — benchmark function name  (e.g. BM_Dijkstra_CSR_ER_Sparse)
+//   GRAPH_T    — graph container type
+//   MAKE_FN    — graph::benchmark::make_csr or make_vov
+//   EDGE_EXPR  — expression producing edge_list, may use vertex_id_t n
+//   N_EXPR     — expression for num_vertices to pass to MAKE_FN (usually n)
+//   HEAP_TAG   — use_default_heap{} or use_indexed_dary_heap<D>{}
+// ---------------------------------------------------------------------------
+
+#define DEFINE_DIJKSTRA_BM(NAME, GRAPH_T, MAKE_FN, EDGE_EXPR, N_EXPR, HEAP_TAG)     \
+  static void NAME(benchmark::State& state) {                                        \
+    const auto n = static_cast<graph::benchmark::vertex_id_t>(state.range(0));       \
+    /* Build graph outside the timed loop */                                         \
+    const auto edges = (EDGE_EXPR);                                                  \
+    GRAPH_T    g     = graph::benchmark::MAKE_FN(edges, (N_EXPR));                   \
+    std::vector<double> dist;                                                         \
+    init_dist(g, dist);                                                              \
+    for (auto _ : state) {                                                           \
+      /* Exclude distance-reset from the measurement */                              \
+      state.PauseTiming();                                                           \
+      std::fill(dist.begin(), dist.end(), std::numeric_limits<double>::max());       \
+      state.ResumeTiming();                                                          \
+      graph::dijkstra_shortest_distances(                                            \
+            g, graph::benchmark::vertex_id_t{0}, graph::container_value_fn(dist),   \
+            weight_fn, graph::empty_visitor{},                                       \
+            std::less<double>{}, std::plus<double>{},                                \
+            HEAP_TAG, std::allocator<std::byte>{});                                  \
+      benchmark::DoNotOptimize(dist.data());                                        \
+    }                                                                                \
+    state.SetComplexityN(state.range(0));                                            \
+  }
+
+// Convenience shorthands for the four heap variants.
+#define DEF_BM_DEFAULT(NAME, GT, MK, EE, NE)  DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_default_heap{})
+#define DEF_BM_IDX2(NAME, GT, MK, EE, NE)     DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_indexed_dary_heap<2>{})
+#define DEF_BM_IDX4(NAME, GT, MK, EE, NE)     DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_indexed_dary_heap<4>{})
+#define DEF_BM_IDX8(NAME, GT, MK, EE, NE)     DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_indexed_dary_heap<8>{})
+
+// ---------------------------------------------------------------------------
+// Erdős–Rényi, E/V ≈ 8  (p = 8/n)
+// ---------------------------------------------------------------------------
+
+#define ER_EDGES(n)   graph::benchmark::erdos_renyi(n, 8.0 / n)
+#define GRID_SQRT(n)  static_cast<graph::benchmark::vertex_id_t>(std::sqrt(static_cast<double>(n)))
+
+DEF_BM_DEFAULT(BM_Dijkstra_CSR_ER_Sparse,       graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n)
+DEF_BM_IDX2   (BM_Dijkstra_CSR_ER_Sparse_Idx2,  graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n)
+DEF_BM_IDX4   (BM_Dijkstra_CSR_ER_Sparse_Idx4,  graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n)
+DEF_BM_IDX8   (BM_Dijkstra_CSR_ER_Sparse_Idx8,  graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n)
+
+DEF_BM_DEFAULT(BM_Dijkstra_VoV_ER_Sparse,       graph::benchmark::vov_graph_t, make_vov, ER_EDGES(n), n)
+DEF_BM_IDX4   (BM_Dijkstra_VoV_ER_Sparse_Idx4,  graph::benchmark::vov_graph_t, make_vov, ER_EDGES(n), n)
+
+BENCHMARK(BM_Dijkstra_CSR_ER_Sparse)      ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_ER_Sparse)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_ER_Sparse_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+
+// ---------------------------------------------------------------------------
+// 2D grid  (rows = cols = sqrt(n), E/V ≈ 4)
+// ---------------------------------------------------------------------------
+
+DEF_BM_DEFAULT(BM_Dijkstra_CSR_Grid,      graph::benchmark::csr_graph_t, make_csr,
+               graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n))
+DEF_BM_IDX2   (BM_Dijkstra_CSR_Grid_Idx2, graph::benchmark::csr_graph_t, make_csr,
+               graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n))
+DEF_BM_IDX4   (BM_Dijkstra_CSR_Grid_Idx4, graph::benchmark::csr_graph_t, make_csr,
+               graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n))
+DEF_BM_IDX8   (BM_Dijkstra_CSR_Grid_Idx8, graph::benchmark::csr_graph_t, make_csr,
+               graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n))
+
+DEF_BM_DEFAULT(BM_Dijkstra_VoV_Grid,      graph::benchmark::vov_graph_t, make_vov,
+               graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n))
+DEF_BM_IDX4   (BM_Dijkstra_VoV_Grid_Idx4, graph::benchmark::vov_graph_t, make_vov,
+               graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n))
+
+BENCHMARK(BM_Dijkstra_CSR_Grid)      ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_Grid_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_Grid_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_Grid_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_Grid)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_Grid_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+
+// ---------------------------------------------------------------------------
+// Barabási–Albert, m=4  (E/V ≈ 8, heavy hub traffic)
+// ---------------------------------------------------------------------------
+
+DEF_BM_DEFAULT(BM_Dijkstra_CSR_BA,      graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n)
+DEF_BM_IDX2   (BM_Dijkstra_CSR_BA_Idx2, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n)
+DEF_BM_IDX4   (BM_Dijkstra_CSR_BA_Idx4, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n)
+DEF_BM_IDX8   (BM_Dijkstra_CSR_BA_Idx8, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n)
+
+DEF_BM_DEFAULT(BM_Dijkstra_VoV_BA,      graph::benchmark::vov_graph_t, make_vov, graph::benchmark::barabasi_albert(n, 4), n)
+DEF_BM_IDX4   (BM_Dijkstra_VoV_BA_Idx4, graph::benchmark::vov_graph_t, make_vov, graph::benchmark::barabasi_albert(n, 4), n)
+
+BENCHMARK(BM_Dijkstra_CSR_BA)      ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_BA_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_BA_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_BA_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_BA)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_BA_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+
+// ---------------------------------------------------------------------------
+// Path graph  (E/V = 1, minimum decrease-key)
+// ---------------------------------------------------------------------------
+
+DEF_BM_DEFAULT(BM_Dijkstra_CSR_Path,      graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n)
+DEF_BM_IDX2   (BM_Dijkstra_CSR_Path_Idx2, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n)
+DEF_BM_IDX4   (BM_Dijkstra_CSR_Path_Idx4, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n)
+DEF_BM_IDX8   (BM_Dijkstra_CSR_Path_Idx8, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n)
+
+DEF_BM_DEFAULT(BM_Dijkstra_VoV_Path,      graph::benchmark::vov_graph_t, make_vov, graph::benchmark::path_graph(n), n)
+DEF_BM_IDX4   (BM_Dijkstra_VoV_Path_Idx4, graph::benchmark::vov_graph_t, make_vov, graph::benchmark::path_graph(n), n)
+
+BENCHMARK(BM_Dijkstra_CSR_Path)      ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_Path_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_Path_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_Path_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_Path)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_Path_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+
+// ---------------------------------------------------------------------------
+// Optional large-scale tier  (V = 1 000 000)
+// Enable with: cmake -DDIJKSTRA_BENCH_LARGE=ON ...
+// ---------------------------------------------------------------------------
+
+#ifdef DIJKSTRA_BENCH_LARGE
+BENCHMARK(BM_Dijkstra_CSR_ER_Sparse)      ->Arg(1'000'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx4)->Arg(1'000'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_ER_Sparse)     ->Arg(1'000'000)->Complexity();
+BENCHMARK(BM_Dijkstra_VoV_ER_Sparse_Idx4)->Arg(1'000'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_BA)      ->Arg(1'000'000)->Complexity();
+BENCHMARK(BM_Dijkstra_CSR_BA_Idx4)->Arg(1'000'000)->Complexity();
+#endif
+
+// ---------------------------------------------------------------------------
+// BGL comparison benchmarks (Phase 4.3)
+//
+// Enabled with -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/path/to/boost.
+// Topologically identical graphs are built from the same edge_list, so the
+// BGL and graph-v3 numbers can be compared directly.
+//
+// BGL uses dijkstra_shortest_paths_no_color_map_no_init for fairness:
+// caller pre-initialises distances; no color-map allocation inside the
+// timed region (matches graph-v3's no-init semantics).
+// ---------------------------------------------------------------------------
+
+#ifdef BENCH_BGL
+
+#define DEFINE_BGL_DIJKSTRA_BM(NAME, GRAPH_T, MAKE_FN, EDGE_EXPR, N_EXPR)            \
+  static void NAME(benchmark::State& state) {                                        \
+    const auto n = static_cast<graph::benchmark::vertex_id_t>(state.range(0));       \
+    const auto edges = (EDGE_EXPR);                                                  \
+    GRAPH_T    g     = graph::benchmark::MAKE_FN(edges, (N_EXPR));                   \
+    std::vector<double> dist(boost::num_vertices(g),                                 \
+                              std::numeric_limits<double>::max());                   \
+    for (auto _ : state) {                                                           \
+      state.PauseTiming();                                                           \
+      std::fill(dist.begin(), dist.end(), std::numeric_limits<double>::max());       \
+      dist[0] = 0.0;                                                                  \
+      state.ResumeTiming();                                                          \
+      graph::benchmark::run_bgl_dijkstra(g, 0u, dist);                                \
+      benchmark::DoNotOptimize(dist.data());                                         \
+    }                                                                                \
+    state.SetComplexityN(state.range(0));                                            \
+  }
+
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_ER_Sparse,
+                       graph::benchmark::bgl_csr_graph_t, make_bgl_csr,
+                       ER_EDGES(n), n)
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_Grid,
+                       graph::benchmark::bgl_csr_graph_t, make_bgl_csr,
+                       graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)),
+                       GRID_SQRT(n) * GRID_SQRT(n))
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_BA,
+                       graph::benchmark::bgl_csr_graph_t, make_bgl_csr,
+                       graph::benchmark::barabasi_albert(n, 4), n)
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_Path,
+                       graph::benchmark::bgl_csr_graph_t, make_bgl_csr,
+                       graph::benchmark::path_graph(n), n)
+
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_ER_Sparse,
+                       graph::benchmark::bgl_adj_graph_t, make_bgl_adj,
+                       ER_EDGES(n), n)
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_Grid,
+                       graph::benchmark::bgl_adj_graph_t, make_bgl_adj,
+                       graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)),
+                       GRID_SQRT(n) * GRID_SQRT(n))
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_BA,
+                       graph::benchmark::bgl_adj_graph_t, make_bgl_adj,
+                       graph::benchmark::barabasi_albert(n, 4), n)
+DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_Path,
+                       graph::benchmark::bgl_adj_graph_t, make_bgl_adj,
+                       graph::benchmark::path_graph(n), n)
+
+BENCHMARK(BM_Dijkstra_BGL_CSR_ER_Sparse)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_CSR_Grid)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_CSR_BA)       ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_CSR_Path)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_Adj_ER_Sparse)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_Adj_Grid)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_Adj_BA)       ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+BENCHMARK(BM_Dijkstra_BGL_Adj_Path)     ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity();
+
+// ---------------------------------------------------------------------------
+// Cross-library distance correctness check.
+//
+// Runs once at startup before benchmarks begin. For each topology, builds
+// the same edge list with both libraries, runs Dijkstra from vertex 0,
+// and asserts the resulting distance vectors match. Differences indicate
+// a wrapper or property-map bug rather than a real performance signal.
+// ---------------------------------------------------------------------------
+
+namespace {
+
+bool check_bgl_distance_parity() {
+  using namespace graph::benchmark;
+  constexpr vertex_id_t n = 1024;
+
+  auto check = [&](const auto& edges) {
+    const std::size_t nv = static_cast<std::size_t>(n);
+
+    // graph-v3
+    auto g3 = make_csr(edges, n);
+    std::vector<double> d3(nv, std::numeric_limits<double>::max());
+    graph::dijkstra_shortest_distances(
+          g3, vertex_id_t{0}, graph::container_value_fn(d3),
+          weight_fn, graph::empty_visitor{},
+          std::less<double>{}, std::plus<double>{},
+          graph::use_default_heap{}, std::allocator<std::byte>{});
+
+    // BGL
+    auto g_bgl = make_bgl_csr(edges, n);
+    std::vector<double> d_bgl(nv, std::numeric_limits<double>::max());
+    d_bgl[0] = 0.0;
+    run_bgl_dijkstra(g_bgl, vertex_id_t{0}, d_bgl);
+
+    if (d3.size() != d_bgl.size()) return false;
+    for (std::size_t i = 0; i < nv; ++i) {
+      // Both write infinity for unreachable; otherwise distances must agree
+      // exactly (same edge weights, same compare/combine).
+      if (d3[i] != d_bgl[i]) return false;
+    }
+    return true;
+  };
+
+  bool ok = true;
+  ok &= check(erdos_renyi(n, 8.0 / static_cast<double>(n)));
+  ok &= check(barabasi_albert(n, 4));
+  ok &= check(path_graph(n));
+  return ok;
+}
+
+const bool kBglParityChecked = [] {
+  if (!check_bgl_distance_parity()) {
+    std::fprintf(stderr,
+                 "FATAL: BGL vs graph-v3 distance parity check failed; "
+                 "benchmarks would compare incorrect results.\n");
+    std::abort();
+  }
+  return true;
+}();
+
+} // namespace
+
+#endif // BENCH_BGL
+
+// ---------------------------------------------------------------------------
+// Entry point
+// ---------------------------------------------------------------------------
+
+BENCHMARK_MAIN();
diff --git a/benchmark/algorithms/bgl_dijkstra_fixtures.hpp b/benchmark/algorithms/bgl_dijkstra_fixtures.hpp
new file mode 100644
index 0000000..1578c43
--- /dev/null
+++ b/benchmark/algorithms/bgl_dijkstra_fixtures.hpp
@@ -0,0 +1,132 @@
+/**
+ * @file bgl_dijkstra_fixtures.hpp
+ * @brief Boost.Graph (BGL) container builders for Dijkstra comparison benchmarks.
+ *
+ * Companion to dijkstra_fixtures.hpp. Builds BGL graphs from the *same*
+ * edge_list produced by the synthetic generators, so graph-v3 and BGL
+ * benchmarks operate on topologically identical graphs.
+ *
+ *   compressed_graph (CSR)        ↔ boost::compressed_sparse_row_graph
+ *   dynamic_graph    (vov)        ↔ boost::adjacency_list<vecS, vecS, directedS, …>
+ *
+ * Only compiled when BENCH_BGL is defined (set by CMake when the BGL include
+ * directory is available); see benchmark/algorithms/CMakeLists.txt.
+ *
+ * Phase 4.3 — sanity-check comparison only. BGL distance results must match
+ * graph-v3 distance results bit-for-bit on the same source vertex; the
+ * benchmarks assert this once at startup before timing starts.
+ */
+
+#pragma once
+
+#include "dijkstra_fixtures.hpp"
+
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/compressed_sparse_row_graph.hpp>
+#include <boost/graph/dijkstra_shortest_paths.hpp>
+#include <boost/graph/dijkstra_shortest_paths_no_color_map.hpp>
+#include <boost/property_map/property_map.hpp>
+
+#include <limits>
+#include <utility>
+#include <vector>
+
+namespace graph::benchmark {
+
+// ---------------------------------------------------------------------------
+// Bundled edge property: a single double weight, mirroring graph-v3 layout.
+// ---------------------------------------------------------------------------
+
+struct bgl_edge_prop {
+  double weight = 0.0;
+};
+
+// ---------------------------------------------------------------------------
+// BGL container types
+// ---------------------------------------------------------------------------
+
+/// CSR equivalent. directedS, no vertex bundle, edge bundle = bgl_edge_prop.
+/// vertex_index is implicit (vecS storage).
+using bgl_csr_graph_t =
+      boost::compressed_sparse_row_graph<boost::directedS,
+                                         boost::no_property,
+                                         bgl_edge_prop,
+                                         boost::no_property,
+                                         vertex_id_t,
+                                         vertex_id_t>;
+
+/// adjacency_list equivalent of dynamic_graph<vov, …>: vector of vector,
+/// directed, edge bundle stores the weight.
+using bgl_adj_graph_t =
+      boost::adjacency_list<boost::vecS, boost::vecS, boost::directedS,
+                            boost::no_property, bgl_edge_prop>;
+
+// ---------------------------------------------------------------------------
+// Container builders
+// ---------------------------------------------------------------------------
+
+/// Build a BGL CSR graph from a graph-v3 edge_list (already sorted by source).
+inline bgl_csr_graph_t make_bgl_csr(const edge_list& edges, vertex_id_t num_vertices) {
+  // BGL CSR ctor wants a range of (vertex_id_t, vertex_id_t) pairs and a
+  // parallel range of edge bundles.
+  std::vector<std::pair<vertex_id_t, vertex_id_t>> pairs;
+  std::vector<bgl_edge_prop>                       props;
+  pairs.reserve(edges.size());
+  props.reserve(edges.size());
+  for (const auto& e : edges) {
+    pairs.emplace_back(e.source_id, e.target_id);
+    props.push_back(bgl_edge_prop{static_cast<double>(e.value)});
+  }
+  return bgl_csr_graph_t(boost::edges_are_sorted,
+                         pairs.begin(), pairs.end(),
+                         props.begin(),
+                         static_cast<std::size_t>(num_vertices));
+}
+
+/// Build a BGL adjacency_list from the same edge_list.
+inline bgl_adj_graph_t make_bgl_adj(const edge_list& edges, vertex_id_t num_vertices) {
+  bgl_adj_graph_t g(static_cast<std::size_t>(num_vertices));
+  for (const auto& e : edges) {
+    boost::add_edge(e.source_id, e.target_id,
+                    bgl_edge_prop{static_cast<double>(e.value)}, g);
+  }
+  return g;
+}
+
+// ---------------------------------------------------------------------------
+// Dijkstra wrapper
+//
+// Uses the no_color_map, no_init variant to match graph-v3 semantics:
+// caller pre-initialises the distance vector; no per-call color-map
+// allocation. Predecessor map is required by the BGL signature even when
+// unused — wired to a dummy iterator_property_map.
+// ---------------------------------------------------------------------------
+
+template <class BglGraph>
+inline void run_bgl_dijkstra(const BglGraph& g,
+                             vertex_id_t source,
+                             std::vector<double>& dist) {
+  using vd_t = typename boost::graph_traits<BglGraph>::vertex_descriptor;
+
+  // Predecessor scratch — required by the API but unused here.
+  std::vector<vd_t> pred(boost::num_vertices(g));
+
+  auto idx       = boost::get(boost::vertex_index, g);
+  auto dist_pmap = boost::make_iterator_property_map(dist.begin(), idx);
+  auto pred_pmap = boost::make_iterator_property_map(pred.begin(), idx);
+  auto w_pmap    = boost::get(&bgl_edge_prop::weight, g);
+
+  boost::dijkstra_shortest_paths_no_color_map_no_init(
+        g, static_cast<vd_t>(source),
+        pred_pmap,
+        dist_pmap,
+        w_pmap,
+        idx,
+        std::less<double>{},
+        boost::closed_plus<double>(),
+        std::numeric_limits<double>::max(),
+        0.0,
+        boost::default_dijkstra_visitor{});
+}
+
+} // namespace graph::benchmark
diff --git a/benchmark/algorithms/dijkstra_fixtures.hpp b/benchmark/algorithms/dijkstra_fixtures.hpp
new file mode 100644
index 0000000..114d9ca
--- /dev/null
+++ b/benchmark/algorithms/dijkstra_fixtures.hpp
@@ -0,0 +1,267 @@
+/**
+ * @file dijkstra_fixtures.hpp
+ * @brief Synthetic graph generators for Dijkstra benchmarks.
+ *
+ * Provides graph generators that isolate three orthogonal axes:
+ *   - Scale        : V ∈ {1K, 10K, 100K}
+ *   - Topology     : Erdős–Rényi, 2D grid, Barabási–Albert, path
+ *   - Weight dist  : uniform, exponential, constant-1
+ *
+ * Each generator returns a sorted edge_list (sorted by source_id, as
+ * required by compressed_graph). Pass the list to make_csr() or make_vov()
+ * to build the target container.
+ *
+ * Usage:
+ *   auto edges = benchmark::erdos_renyi(10'000, 8.0 / 10'000);
+ *   auto g     = benchmark::make_csr(edges, 10'000);
+ *   // ... run Dijkstra on g
+ */
+
+#pragma once
+
+#include <graph/container/compressed_graph.hpp>
+#include <graph/container/dynamic_graph.hpp>
+#include <graph/container/traits/vov_graph_traits.hpp>
+#include <graph/graph_data.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+namespace graph::benchmark {
+
+// ---------------------------------------------------------------------------
+// Common types
+// ---------------------------------------------------------------------------
+
+using vertex_id_t = uint32_t;
+using weight_t    = double;
+using edge_entry  = graph::copyable_edge_t<vertex_id_t, weight_t>;
+using edge_list   = std::vector<edge_entry>;
+
+/// Primary container: CSR layout; minimises traversal overhead so that
+/// heap cost is the dominant measurable term.
+using csr_graph_t =
+      graph::container::compressed_graph<weight_t, void, void, vertex_id_t, vertex_id_t>;
+
+/// Secondary container: vov-backed dynamic_graph; representative of typical
+/// user code and used as a regression baseline.
+using vov_graph_t =
+      graph::container::dynamic_graph<weight_t, void, void, vertex_id_t, false,
+                                      graph::container::vov_graph_traits<weight_t, void, void,
+                                                                         vertex_id_t, false>>;
+
+// ---------------------------------------------------------------------------
+// Weight distribution
+// ---------------------------------------------------------------------------
+
+enum class weight_dist {
+  uniform,      ///< U[1, 100] — default, "average case"
+  exponential,  ///< Exp(0.1) + 1 — heavy left tail, more decrease-key events
+  constant_one, ///< Always 1 — BFS-equivalent floor, minimum variance
+};
+
+inline double sample_weight(std::mt19937_64& rng, weight_dist dist) {
+  switch (dist) {
+    case weight_dist::uniform: {
+      std::uniform_real_distribution<double> d(1.0, 100.0);
+      return d(rng);
+    }
+    case weight_dist::exponential: {
+      std::exponential_distribution<double> d(0.1);
+      return 1.0 + d(rng);
+    }
+    case weight_dist::constant_one:
+    default:
+      return 1.0;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Erdős–Rényi G(n, p)  — directed, self-loops excluded
+//
+// Uses the O(E) geometric-skip algorithm (Batagelj & Brandes, 2005) instead
+// of the naive O(n²) coin-flip loop, so it scales to n = 10⁶.
+//
+// The n*(n−1) ordered (u,v) pairs with u≠v are enumerated as positions
+//   pos ∈ [0, n*(n−1))
+// where position pos maps to:
+//   u      = pos / (n−1)
+//   offset = pos % (n−1)
+//   v      = offset < u ? offset : offset + 1   (skip self-loop)
+//
+// Set p = k / n for E/V ≈ k (sparse: k=2, moderate: k=8, dense: k=32).
+// The resulting edge list is already sorted by source_id because positions
+// are visited in ascending order and u is non-decreasing.
+// ---------------------------------------------------------------------------
+
+inline edge_list erdos_renyi(vertex_id_t n, double p, uint64_t seed = 42,
+                             weight_dist wdist = weight_dist::uniform) {
+  std::mt19937_64 rng(seed);
+  const size_t    total = static_cast<size_t>(n) * (n - 1); // n*(n-1) directed pairs
+
+  edge_list edges;
+  const size_t expected = static_cast<size_t>(total * p * 1.1) + 16;
+  edges.reserve(expected);
+
+  // Geometric skip: sample the gap between consecutive selected positions.
+  // std::geometric_distribution<size_t> gives the number of failures before
+  // the first success, so adding 1 gives the gap to the *next* success.
+  std::geometric_distribution<size_t> geom(p);
+
+  size_t pos = geom(rng); // 0-indexed position of the first selected edge
+  while (pos < total) {
+    const vertex_id_t u      = static_cast<vertex_id_t>(pos / (n - 1));
+    const vertex_id_t offset = static_cast<vertex_id_t>(pos % (n - 1));
+    const vertex_id_t v      = (offset < u) ? offset : offset + 1;
+    edges.push_back({u, v, sample_weight(rng, wdist)});
+    pos += geom(rng) + 1;
+  }
+  // Edges are already sorted by source_id (u is non-decreasing).
+  return edges;
+}
+
+// ---------------------------------------------------------------------------
+// 2D grid graph (rows × cols)  — bidirectional 4-connected
+//
+// Vertex (r, c) has id r*cols + c.
+// Horizontal and vertical neighbour pairs each get two directed edges
+// (both directions), giving E/V ≈ 4 for interior vertices.
+// The returned list is sorted by source_id.
+// ---------------------------------------------------------------------------
+
+inline edge_list grid_2d(vertex_id_t rows, vertex_id_t cols, uint64_t seed = 42,
+                         weight_dist wdist = weight_dist::uniform) {
+  std::mt19937_64 rng(seed);
+  const vertex_id_t n = rows * cols;
+
+  edge_list edges;
+  edges.reserve(4 * static_cast<size_t>(n)); // upper bound
+
+  for (vertex_id_t r = 0; r < rows; ++r) {
+    for (vertex_id_t c = 0; c < cols; ++c) {
+      vertex_id_t u = r * cols + c;
+      // Right neighbour
+      if (c + 1 < cols) {
+        vertex_id_t v = u + 1;
+        edges.push_back({u, v, sample_weight(rng, wdist)});
+        edges.push_back({v, u, sample_weight(rng, wdist)});
+      }
+      // Down neighbour
+      if (r + 1 < rows) {
+        vertex_id_t v = u + cols;
+        edges.push_back({u, v, sample_weight(rng, wdist)});
+        edges.push_back({v, u, sample_weight(rng, wdist)});
+      }
+    }
+  }
+  std::stable_sort(edges.begin(), edges.end(),
+                   [](const edge_entry& a, const edge_entry& b) {
+                     return a.source_id < b.source_id;
+                   });
+  return edges;
+}
+
+// ---------------------------------------------------------------------------
+// Barabási–Albert preferential attachment  — scale-free / power-law
+//
+// Starts with a fully-connected seed of m0 = max(m, 2) vertices, then
+// adds each subsequent vertex w by selecting m existing targets with
+// probability proportional to their current degree ("urn" method).
+// Both w→t and t→w directed edges are added so the graph is undirected
+// in terms of reachability, which maximises relaxation traffic from hubs.
+// The returned list is sorted by source_id.
+// ---------------------------------------------------------------------------
+
+inline edge_list barabasi_albert(vertex_id_t n, vertex_id_t m, uint64_t seed = 42,
+                                 weight_dist wdist = weight_dist::uniform) {
+  std::mt19937_64 rng(seed);
+
+  // "Urn" stores one entry per endpoint per edge, giving degree-proportional
+  // selection at O(1) per pick (trade memory for simplicity).
+  std::vector<vertex_id_t> urn;
+  urn.reserve(2 * static_cast<size_t>(n) * m);
+
+  edge_list edges;
+  edges.reserve(2 * static_cast<size_t>(n) * m);
+
+  // Seed: fully-connected clique of m0 vertices
+  const vertex_id_t m0 = std::max(m, vertex_id_t{2});
+  for (vertex_id_t u = 0; u < m0; ++u) {
+    for (vertex_id_t v = u + 1; v < m0; ++v) {
+      edges.push_back({u, v, sample_weight(rng, wdist)});
+      edges.push_back({v, u, sample_weight(rng, wdist)});
+      urn.push_back(u);
+      urn.push_back(v);
+    }
+  }
+
+  for (vertex_id_t w = m0; w < n; ++w) {
+    std::vector<vertex_id_t> chosen;
+    chosen.reserve(m);
+
+    while (chosen.size() < m) {
+      std::uniform_int_distribution<size_t> pick(0, urn.size() - 1);
+      vertex_id_t t = urn[pick(rng)];
+      // chosen.size() ≤ m ≤ ~8 so linear scan is fine
+      bool already = (t == w);
+      for (auto x : chosen) already |= (x == t);
+      if (!already) {
+        chosen.push_back(t);
+        edges.push_back({w, t, sample_weight(rng, wdist)});
+        edges.push_back({t, w, sample_weight(rng, wdist)});
+        urn.push_back(w);
+        urn.push_back(t);
+      }
+    }
+  }
+
+  std::stable_sort(edges.begin(), edges.end(),
+                   [](const edge_entry& a, const edge_entry& b) {
+                     return a.source_id < b.source_id;
+                   });
+  return edges;
+}
+
+// ---------------------------------------------------------------------------
+// Path graph: 0 → 1 → 2 → … → (n−1)
+//
+// Minimum decrease-key traffic: each vertex is relaxed at most once.
+// Serves as a lower-bound sanity check.
+// ---------------------------------------------------------------------------
+
+inline edge_list path_graph(vertex_id_t n, uint64_t seed = 42,
+                            weight_dist wdist = weight_dist::uniform) {
+  std::mt19937_64 rng(seed);
+  edge_list       edges;
+  edges.reserve(n > 0 ? n - 1 : 0);
+
+  for (vertex_id_t u = 0; u + 1 < n; ++u) {
+    edges.push_back({u, u + 1, sample_weight(rng, wdist)});
+  }
+  // Already sorted.
+  return edges;
+}
+
+// ---------------------------------------------------------------------------
+// Container builders
+// ---------------------------------------------------------------------------
+
+/// Build a compressed_graph (CSR) from a pre-sorted edge list.
+/// edges must be sorted ascending by source_id (enforced by assertion in
+/// compressed_graph::load_edges).
+inline csr_graph_t make_csr(const edge_list& edges, vertex_id_t num_vertices) {
+  csr_graph_t g;
+  g.load_edges(edges, std::identity{}, num_vertices);
+  return g;
+}
+
+/// Build a vov dynamic_graph from an edge list (order does not matter).
+inline vov_graph_t make_vov(const edge_list& edges, vertex_id_t num_vertices) {
+  vov_graph_t g;
+  g.load_edges(edges, std::identity{}, num_vertices);
+  return g;
+}
+
+} // namespace graph::benchmark
diff --git a/benchmark/data/README.md b/benchmark/data/README.md
new file mode 100644
index 0000000..8355008
--- /dev/null
+++ b/benchmark/data/README.md
@@ -0,0 +1,73 @@
+# Benchmark Data
+
+This directory contains real-world graph data files used as validation
+fixtures in the Dijkstra benchmark suite (Phase 0.2).  The files are
+large and are therefore **not committed** to the repository (see
+`.gitignore`).  Use the instructions below to download them before
+running the real-world validation benchmarks.
+
+---
+
+## Required files
+
+| Filename | Vertices | Edges | Source | Description |
+|----------|----------|-------|--------|-------------|
+| `roadNet-CA.txt` | 1,965,206 | 5,533,214 | SNAP | California road network – classic Dijkstra benchmark, planar/spatial |
+| `web-Google.txt` | 875,713 | 5,105,039 | SNAP | Web-link graph – mixed degree distribution |
+
+---
+
+## Download instructions
+
+### Stanford SNAP graphs
+
+```bash
+# Create the data directory if it does not already exist
+mkdir -p benchmark/data
+
+# California road network
+curl -L "https://snap.stanford.edu/data/roadNet-CA.txt.gz" \
+  | gunzip > benchmark/data/roadNet-CA.txt
+
+# Google web graph
+curl -L "https://snap.stanford.edu/data/web-Google.txt.gz" \
+  | gunzip > benchmark/data/web-Google.txt
+```
+
+Alternatively, download from <https://snap.stanford.edu/data/> and place
+the decompressed `.txt` files in this directory.
+
+---
+
+## File format
+
+SNAP edge-list files use the following format:
+
+```
+# Comment lines start with '#'
+<source_id>\t<target_id>
+```
+
+Vertex ids are 0-based integers.  The benchmark loader skips comment
+lines and treats each remaining line as a directed edge.
+
+---
+
+## Loader
+
+The fixture helper `benchmark/algorithms/dijkstra_fixtures.hpp` will
+gain a `load_snap_graph()` function in Phase 0 that reads these files
+and returns a sorted `edge_list`.  For now, running the real-world
+benchmarks requires that the files are present; if they are absent the
+corresponding benchmark cases are skipped at runtime with a message.
+
+---
+
+## License / attribution
+
+The SNAP graphs are distributed by the Stanford Network Analysis Project
+under their respective licences.  Please cite the original dataset when
+publishing results.
+
+- Jure Leskovec and Andrej Krevl.  *SNAP Datasets: Stanford Large Network
+  Dataset Collection*, <http://snap.stanford.edu/data>, June 2014.
diff --git a/include/graph/algorithm/dijkstra_shortest_paths.hpp b/include/graph/algorithm/dijkstra_shortest_paths.hpp
index 2d60f0b..4df527c 100644
--- a/include/graph/algorithm/dijkstra_shortest_paths.hpp
+++ b/include/graph/algorithm/dijkstra_shortest_paths.hpp
@@ -15,16 +15,53 @@
 #include "graph/graph.hpp"
 #include "graph/algorithm/traversal_common.hpp"
 #include "graph/adj_list/vertex_property_map.hpp"
+#include "graph/detail/indexed_dary_heap.hpp"
+#include "graph/detail/heap_position_map.hpp"
 
 #include <queue>
 #include <ranges>
 #include <format>
+#include <type_traits>
 
 #ifndef GRAPH_DIJKSTRA_SHORTEST_PATHS_HPP
 #  define GRAPH_DIJKSTRA_SHORTEST_PATHS_HPP
 
 namespace graph {
 
+/**
+ * @brief Heap-selector tag: use the historical std::priority_queue path.
+ *
+ * Default selector for `dijkstra_shortest_paths`. Lazy-deletion priority queue;
+ * heap may grow to O(E) and stale entries are skipped at pop time.
+ *
+ * Recommended for: sparse graphs (E/V ≲ 4), grid-like topologies, path/tree
+ * graphs, and any workload with low decrease-key pressure. Phase 4 benchmarks
+ * showed this path wins by 20–40% on grid (E/V≈4) and path (E/V=1) workloads.
+ */
+struct use_default_heap {};
+
+/**
+ * @brief Heap-selector tag: use the indexed d-ary heap with true decrease-key.
+ *
+ * Heap size is bounded by O(V); no stale pops. Supports both index_vertex_range
+ * graphs (dense vector_position_map) and mapped containers / hashable non-dense
+ * vertex ids (assoc_position_map).
+ *
+ * Recommended for: dense or hub-heavy graphs (E/V ≳ 8) where many edges trigger
+ * relaxation. Phase 4 benchmarks at 100K vertices on `compressed_graph`:
+ * Erdős–Rényi (E/V≈8) −25%, Barabási–Albert (E/V≈8) −17% with `Arity=8`.
+ * Loses 20–40% on grid/path workloads where decrease-key is rare.
+ *
+ * `Arity=8` is the recommended setting on x86_64 for high-E/V workloads;
+ * `Arity=4` matches Boost's `d_ary_heap_indirect`.
+ *
+ * @tparam Arity Children per node (default 4 — matches Boost's d_ary_heap_indirect).
+ */
+template <std::size_t Arity = 4>
+struct use_indexed_dary_heap {
+  static constexpr std::size_t arity = Arity;
+};
+
 // Import CPOs and types for use in algorithms
 using adj_list::vertices;
 using adj_list::num_vertices;
@@ -178,6 +215,7 @@ template <
       class Visitor = empty_visitor,
       class Compare = less<distance_fn_value_t<DistanceFn, G>>,
       class Combine = plus<distance_fn_value_t<DistanceFn, G>>,
+      class Heap    = use_default_heap,
       class Alloc   = std::allocator<std::byte>>
 requires distance_fn_for<DistanceFn, G> &&                                //
          predecessor_fn_for<PredecessorFn, G> &&                          //
@@ -195,7 +233,8 @@ constexpr void dijkstra_shortest_paths(
       Visitor&& visitor = empty_visitor(),
       Compare&& compare = less<distance_fn_value_t<DistanceFn, G>>(),
       Combine&& combine = plus<distance_fn_value_t<DistanceFn, G>>(),
-      const Alloc& alloc = Alloc()) {
+      Heap          /*heap_tag*/ = Heap{},
+      const Alloc&  alloc        = Alloc()) {
   using graph_type    = std::remove_reference_t<G>;
   using id_type       = vertex_id_t<graph_type>;
   using distance_type = distance_fn_value_t<DistanceFn, G>;
@@ -234,28 +273,7 @@ constexpr void dijkstra_shortest_paths(
     return false;
   };
 
-  // Define and initialize the priority queue for Dijkstra's algorithm. We use a min-heap based on distance.
-  //
-  // NOTE: std::priority_queue lacks a decrease-key operation, so when a vertex's distance
-  // improves we re-insert it (lazy deletion). The earlier entry becomes stale and is
-  // skipped at pop time (see the stale-pop check in the main loop). This keeps the code
-  // simple but allows the heap to grow to O(E) entries in the worst case.
-  //
-  // A future optimization is to replace this with an indexed d-ary heap supporting true
-  // decrease-key (matching Boost's d_ary_heap_indirect with d=4). That would cap heap
-  // size at O(V), eliminate stale pops, and typically improve cache behavior. See
-  // agents/indexed_dary_heap_plan.md for the design and migration plan.
-  struct weighted_vertex {
-    vertex_t<graph_type> vertex_desc = {};
-    distance_type        weight      = distance_type();
-  };
-  auto qcompare = [&compare](const weighted_vertex& a, const weighted_vertex& b) {
-    return compare(b.weight, a.weight); // min-heap: pop lowest weight first
-  };
-  using WVAlloc = typename std::allocator_traits<Alloc>::template rebind_alloc<weighted_vertex>;
-  using Queue = std::priority_queue<weighted_vertex, std::vector<weighted_vertex, WVAlloc>, decltype(qcompare)>;
-  Queue queue(qcompare, std::vector<weighted_vertex, WVAlloc>(WVAlloc(alloc)));
-
+  // Initialize-vertex visitor callbacks (shared across heap implementations).
   // (The optimizer removes this loop if on_initialize_vertex() is empty.)
   if constexpr (has_on_initialize_vertex<graph_type, Visitor> || has_on_initialize_vertex_id<graph_type, Visitor>) {
     for (auto&& [uid, u] : views::vertexlist(g)) {
@@ -267,86 +285,248 @@ constexpr void dijkstra_shortest_paths(
     }
   }
 
-  // Seed the queue with the initial vertice(s)
-  for (auto&& seed_id : sources) {
-    auto seed_it = find_vertex(g, seed_id);
-    if (seed_it == std::ranges::end(vertices(g))) {
-      throw std::out_of_range(std::format("dijkstra_shortest_paths: source vertex id '{}' is out of range", seed_id));
-    }
-    vertex_t<graph_type> seed = *seed_it;
-
-    distance(g, seed_id) = zero; // mark seed_id as discovered
-    queue.push({seed, zero});
-    if constexpr (has_on_discover_vertex<graph_type, Visitor>) {
-      visitor.on_discover_vertex(g, seed);
-    } else if constexpr (has_on_discover_vertex_id<graph_type, Visitor>) {
-      visitor.on_discover_vertex(g, seed_id);
-    }
-  }
+  // ---------------------------------------------------------------------
+  // Heap-implementation dispatch.
+  //
+  // - use_default_heap         : std::priority_queue with lazy deletion.
+  // - use_indexed_dary_heap<d> : indexed d-ary heap with true decrease-key
+  //                              (heap size bounded by O(V)).
+  //
+  // Both branches honour identical visitor semantics: on_examine_vertex and
+  // on_finish_vertex fire exactly once per reachable vertex; on_edge_relaxed
+  // and on_edge_not_relaxed fire exactly once per outgoing edge of every
+  // examined vertex.
+  // ---------------------------------------------------------------------
+  if constexpr (std::is_same_v<Heap, use_default_heap>) {
+    // -----------------------------------------------------------------
+    // std::priority_queue path (legacy / default).
+    //
+    // std::priority_queue lacks a decrease-key operation, so when a vertex's
+    // distance improves we re-insert it (lazy deletion). The earlier entry
+    // becomes stale and is skipped at pop time. This keeps the code simple
+    // but allows the heap to grow to O(E) entries in the worst case.
+    // -----------------------------------------------------------------
+    struct weighted_vertex {
+      vertex_t<graph_type> vertex_desc = {};
+      distance_type        weight      = distance_type();
+    };
+    auto qcompare = [&compare](const weighted_vertex& a, const weighted_vertex& b) {
+      return compare(b.weight, a.weight); // min-heap: pop lowest weight first
+    };
+    using WVAlloc = typename std::allocator_traits<Alloc>::template rebind_alloc<weighted_vertex>;
+    using Queue = std::priority_queue<weighted_vertex, std::vector<weighted_vertex, WVAlloc>, decltype(qcompare)>;
+    Queue queue(qcompare, std::vector<weighted_vertex, WVAlloc>(WVAlloc(alloc)));
 
-  // Main loop to process the queue
-  while (!queue.empty()) {
-    auto [u, w] = queue.top();
-    queue.pop();
-    const id_type uid = vertex_id(g, u);
-
-    // Skip stale queue entries: because std::priority_queue lacks decrease-key,
-    // we re-insert vertices when their distance is improved. The earlier (larger)
-    // entry is still in the heap and must be ignored when popped. This also
-    // ensures on_examine_vertex / on_finish_vertex fire exactly once per vertex,
-    // matching BGL visitor semantics.
-    if (compare(distance(g, uid), w)) {
-      continue;
-    }
+    // Seed the queue with the initial vertice(s)
+    for (auto&& seed_id : sources) {
+      auto seed_it = find_vertex(g, seed_id);
+      if (seed_it == std::ranges::end(vertices(g))) {
+        throw std::out_of_range(std::format("dijkstra_shortest_paths: source vertex id '{}' is out of range", seed_id));
+      }
+      vertex_t<graph_type> seed = *seed_it;
 
-    if constexpr (has_on_examine_vertex<graph_type, Visitor>) {
-      visitor.on_examine_vertex(g, u);
-    } else     if constexpr (has_on_examine_vertex_id<graph_type, Visitor>) {
-      visitor.on_examine_vertex(g, uid);
+      distance(g, seed_id) = zero; // mark seed_id as discovered
+      queue.push({seed, zero});
+      if constexpr (has_on_discover_vertex<graph_type, Visitor>) {
+        visitor.on_discover_vertex(g, seed);
+      } else if constexpr (has_on_discover_vertex_id<graph_type, Visitor>) {
+        visitor.on_discover_vertex(g, seed_id);
+      }
     }
 
-    // Process all outgoing edges from the current vertex
-    for (auto&& [vid, uv] : views::incidence(g, u)) {
-      if constexpr (has_on_examine_edge<graph_type, Visitor>) {
-        visitor.on_examine_edge(g, uv);
+    // Main loop to process the queue
+    while (!queue.empty()) {
+      auto [u, w] = queue.top();
+      queue.pop();
+      const id_type uid = vertex_id(g, u);
+
+      // Skip stale queue entries: because std::priority_queue lacks decrease-key,
+      // we re-insert vertices when their distance is improved. The earlier (larger)
+      // entry is still in the heap and must be ignored when popped. This also
+      // ensures on_examine_vertex / on_finish_vertex fire exactly once per vertex,
+      // matching BGL visitor semantics.
+      if (compare(distance(g, uid), w)) {
+        continue;
       }
 
-      // Use the user-supplied comparator for "undiscovered" detection so that
-      // custom Compare orderings remain consistent (matches BGL's
-      // !distance_compare(neighbor_distance, infinity)).
-      const bool is_neighbor_undiscovered = !compare(distance(g, vid), infinite);
-      const bool was_edge_relaxed         = relax_target(uv, uid);
+      if constexpr (has_on_examine_vertex<graph_type, Visitor>) {
+        visitor.on_examine_vertex(g, u);
+      } else if constexpr (has_on_examine_vertex_id<graph_type, Visitor>) {
+        visitor.on_examine_vertex(g, uid);
+      }
 
-      if (was_edge_relaxed) {
-        if constexpr (has_on_edge_relaxed<graph_type, Visitor>) {
-          visitor.on_edge_relaxed(g, uv);
+      // Process all outgoing edges from the current vertex
+      for (auto&& [vid, uv] : views::incidence(g, u)) {
+        if constexpr (has_on_examine_edge<graph_type, Visitor>) {
+          visitor.on_examine_edge(g, uv);
         }
-        vertex_t<graph_type> v = target(g, uv);
-        if (is_neighbor_undiscovered) {
-          if constexpr (has_on_discover_vertex<graph_type, Visitor>) {
-            visitor.on_discover_vertex(g, v);
-          } else if constexpr (has_on_discover_vertex_id<graph_type, Visitor>) {
-            visitor.on_discover_vertex(g, vid);
+
+        // Use the user-supplied comparator for "undiscovered" detection so that
+        // custom Compare orderings remain consistent (matches BGL's
+        // !distance_compare(neighbor_distance, infinity)).
+        const bool is_neighbor_undiscovered = !compare(distance(g, vid), infinite);
+        const bool was_edge_relaxed         = relax_target(uv, uid);
+
+        if (was_edge_relaxed) {
+          if constexpr (has_on_edge_relaxed<graph_type, Visitor>) {
+            visitor.on_edge_relaxed(g, uv);
+          }
+          vertex_t<graph_type> v = target(g, uv);
+          if (is_neighbor_undiscovered) {
+            if constexpr (has_on_discover_vertex<graph_type, Visitor>) {
+              visitor.on_discover_vertex(g, v);
+            } else if constexpr (has_on_discover_vertex_id<graph_type, Visitor>) {
+              visitor.on_discover_vertex(g, vid);
+            }
+          }
+          queue.push({v, distance(g, vid)});
+        } else {
+          if constexpr (has_on_edge_not_relaxed<graph_type, Visitor>) {
+            visitor.on_edge_not_relaxed(g, uv);
           }
         }
-        queue.push({v, distance(g, vid)});
-      } else {
-        if constexpr (has_on_edge_not_relaxed<graph_type, Visitor>) {
-          visitor.on_edge_not_relaxed(g, uv);
+      }
+
+      // The stale-pop skip at the top of the loop guarantees we only reach this
+      // point on the settled (final) pop of u, so on_examine_vertex and
+      // on_finish_vertex are each called exactly once per reachable vertex,
+      // matching BGL visitor semantics.
+      if constexpr (has_on_finish_vertex<graph_type, Visitor>) {
+        visitor.on_finish_vertex(g, u);
+      } else if constexpr (has_on_finish_vertex_id<graph_type, Visitor>) {
+        visitor.on_finish_vertex(g, uid);
+      }
+    } // while(!queue.empty())
+  } else {
+    // -----------------------------------------------------------------
+    // indexed d-ary heap path.
+    //
+    // True decrease-key: at most one heap entry per vertex (size <= V).
+    // No stale pops. Vertex distances are read live via DistanceFn so the
+    // heap order tracks the current best-known distance.
+    //
+    // The position-map adapter is selected at compile time:
+    //
+    //   - index_vertex_range<G>  : vector_position_map (dense O(V) array).
+    //   - mapped containers / non-dense ids
+    //                            : assoc_position_map  (unordered_map).
+    //
+    // The vector adapter is faster (no hashing, contiguous storage) but
+    // requires vertex ids in [0, num_vertices(g)). Mapped containers
+    // (mov, mod, uov, ...) and any graph whose vertex_id_t is non-integral
+    // fall through to the associative adapter automatically.
+    // -----------------------------------------------------------------
+    constexpr std::size_t arity = Heap::arity;
+
+    // Live distance lookup for the heap (reads, never writes).
+    auto heap_distfn = [&g, &distance](const id_type& k) -> const distance_type& {
+      return distance(g, k);
+    };
+
+    // Seed + main loop, generic over the heap type so the dense and sparse
+    // position-map branches share a single body.
+    auto run = [&](auto& heap) {
+      // Seed the heap with the initial vertice(s).
+      for (auto&& seed_id : sources) {
+        auto seed_it = find_vertex(g, seed_id);
+        if (seed_it == std::ranges::end(vertices(g))) {
+          throw std::out_of_range(
+                std::format("dijkstra_shortest_paths: source vertex id '{}' is out of range", seed_id));
+        }
+
+        distance(g, seed_id) = zero; // mark seed_id as discovered
+        heap.push(static_cast<id_type>(seed_id));
+        if constexpr (has_on_discover_vertex<graph_type, Visitor>) {
+          visitor.on_discover_vertex(g, *seed_it);
+        } else if constexpr (has_on_discover_vertex_id<graph_type, Visitor>) {
+          visitor.on_discover_vertex(g, seed_id);
         }
       }
-    }
 
-    // The stale-pop skip at the top of the loop guarantees we only reach this
-    // point on the settled (final) pop of u, so on_examine_vertex and
-    // on_finish_vertex are each called exactly once per reachable vertex,
-    // matching BGL visitor semantics.
-    if constexpr (has_on_finish_vertex<graph_type, Visitor>) {
-      visitor.on_finish_vertex(g, u);
-    } else if constexpr (has_on_finish_vertex_id<graph_type, Visitor>) {
-      visitor.on_finish_vertex(g, uid);
+      // Main loop. With true decrease-key there are no stale entries: every
+      // pop yields the next finalized vertex.
+      while (!heap.empty()) {
+        const id_type uid = heap.top();
+        heap.pop();
+        vertex_t<graph_type> u = *find_vertex(g, uid);
+
+        if constexpr (has_on_examine_vertex<graph_type, Visitor>) {
+          visitor.on_examine_vertex(g, u);
+        } else if constexpr (has_on_examine_vertex_id<graph_type, Visitor>) {
+          visitor.on_examine_vertex(g, uid);
+        }
+
+        for (auto&& [vid, uv] : views::incidence(g, u)) {
+          if constexpr (has_on_examine_edge<graph_type, Visitor>) {
+            visitor.on_examine_edge(g, uv);
+          }
+
+          const bool is_neighbor_undiscovered = !compare(distance(g, vid), infinite);
+          const bool was_edge_relaxed         = relax_target(uv, uid);
+
+          if (was_edge_relaxed) {
+            if constexpr (has_on_edge_relaxed<graph_type, Visitor>) {
+              visitor.on_edge_relaxed(g, uv);
+            }
+            if (is_neighbor_undiscovered) {
+              if constexpr (has_on_discover_vertex<graph_type, Visitor>) {
+                vertex_t<graph_type> v = target(g, uv);
+                visitor.on_discover_vertex(g, v);
+              } else if constexpr (has_on_discover_vertex_id<graph_type, Visitor>) {
+                visitor.on_discover_vertex(g, vid);
+              }
+              heap.push(vid);
+            } else {
+              // v has finite distance and was just improved; under Dijkstra's
+              // non-negative-weight invariant a finalized vertex cannot be
+              // relaxed, so v must still be in the heap.
+              heap.decrease(vid);
+            }
+          } else {
+            if constexpr (has_on_edge_not_relaxed<graph_type, Visitor>) {
+              visitor.on_edge_not_relaxed(g, uv);
+            }
+          }
+        }
+
+        if constexpr (has_on_finish_vertex<graph_type, Visitor>) {
+          visitor.on_finish_vertex(g, u);
+        } else if constexpr (has_on_finish_vertex_id<graph_type, Visitor>) {
+          visitor.on_finish_vertex(g, uid);
+        }
+      } // while(!heap.empty())
+    };
+
+    using HeapAlloc = typename std::allocator_traits<Alloc>::template rebind_alloc<id_type>;
+
+    if constexpr (adj_list::index_vertex_range<graph_type>) {
+      // ---- Dense path: vector_position_map ----
+      // The position vector uses the default allocator; the user-supplied
+      // Alloc is forwarded only to the heap's internal storage (matching the
+      // documented role of Alloc as "internal priority queue storage").
+      std::vector<std::size_t> positions(num_vertices(g), detail::vector_position_map::npos);
+      using HeapT = detail::indexed_dary_heap<id_type, decltype(heap_distfn), Compare,
+                                              detail::vector_position_map, arity, HeapAlloc>;
+      HeapT heap(heap_distfn, compare,
+                 detail::vector_position_map{positions},
+                 HeapAlloc(alloc));
+      run(heap);
+    } else {
+      // ---- Sparse / mapped path: assoc_position_map ----
+      static_assert(adj_list::hashable_vertex_id<graph_type>,
+                    "use_indexed_dary_heap requires either index_vertex_range<G> or a "
+                    "hashable vertex_id_t<G> for the associative position-map adapter.");
+
+      using PMap = detail::assoc_position_map<id_type>;
+      typename PMap::map_type positions;
+      positions.reserve(num_vertices(g));
+
+      using HeapT = detail::indexed_dary_heap<id_type, decltype(heap_distfn), Compare,
+                                              PMap, arity, HeapAlloc>;
+      HeapT heap(heap_distfn, compare, PMap{positions}, HeapAlloc(alloc));
+      run(heap);
     }
-  } // while(!queue.empty())
+  } // if constexpr Heap dispatch
 }
 
 /**
@@ -368,6 +548,7 @@ template <
       class Visitor = empty_visitor,
       class Compare = less<distance_fn_value_t<DistanceFn, G>>,
       class Combine = plus<distance_fn_value_t<DistanceFn, G>>,
+      class Heap    = use_default_heap,
       class Alloc   = std::allocator<std::byte>>
 requires distance_fn_for<DistanceFn, G> &&                                //
          predecessor_fn_for<PredecessorFn, G> &&                          //
@@ -384,9 +565,11 @@ constexpr void dijkstra_shortest_paths(
       Visitor&& visitor = empty_visitor(),
       Compare&& compare = less<distance_fn_value_t<DistanceFn, G>>(),
       Combine&& combine = plus<distance_fn_value_t<DistanceFn, G>>(),
-      const Alloc& alloc = Alloc()) {
+      Heap         heap_tag = Heap{},
+      const Alloc& alloc    = Alloc()) {
   dijkstra_shortest_paths(g, subrange(&source, (&source + 1)), distance, predecessor, weight,
-                          forward<Visitor>(visitor), forward<Compare>(compare), forward<Combine>(combine), alloc);
+                          forward<Visitor>(visitor), forward<Compare>(compare), forward<Combine>(combine),
+                          heap_tag, alloc);
 }
 
 /**
@@ -431,6 +614,7 @@ template <
       class Visitor = empty_visitor,
       class Compare = less<distance_fn_value_t<DistanceFn, G>>,
       class Combine = plus<distance_fn_value_t<DistanceFn, G>>,
+      class Heap    = use_default_heap,
       class Alloc   = std::allocator<std::byte>>
 requires distance_fn_for<DistanceFn, G> &&                                                //
          convertible_to<range_value_t<Sources>, vertex_id_t<G>> &&                              //
@@ -446,9 +630,10 @@ constexpr void dijkstra_shortest_distances(
       Visitor&& visitor = empty_visitor(),
       Compare&& compare = less<distance_fn_value_t<DistanceFn, G>>(),
       Combine&& combine = plus<distance_fn_value_t<DistanceFn, G>>(),
-      const Alloc& alloc = Alloc()) {
+      Heap         heap_tag = Heap{},
+      const Alloc& alloc    = Alloc()) {
   dijkstra_shortest_paths(g, sources, distance, _null_predecessor, forward<WF>(weight), forward<Visitor>(visitor),
-                          forward<Compare>(compare), forward<Combine>(combine), alloc);
+                          forward<Compare>(compare), forward<Combine>(combine), heap_tag, alloc);
 }
 
 /**
@@ -471,6 +656,7 @@ template <
       class Visitor = empty_visitor,
       class Compare = less<distance_fn_value_t<DistanceFn, G>>,
       class Combine = plus<distance_fn_value_t<DistanceFn, G>>,
+      class Heap    = use_default_heap,
       class Alloc   = std::allocator<std::byte>>
 requires distance_fn_for<DistanceFn, G> &&                                                //
          basic_edge_weight_function<G, WF, distance_fn_value_t<DistanceFn, G>, Compare, Combine>
@@ -485,9 +671,11 @@ constexpr void dijkstra_shortest_distances(
       Visitor&& visitor = empty_visitor(),
       Compare&& compare = less<distance_fn_value_t<DistanceFn, G>>(),
       Combine&& combine = plus<distance_fn_value_t<DistanceFn, G>>(),
-      const Alloc& alloc = Alloc()) {
+      Heap         heap_tag = Heap{},
+      const Alloc& alloc    = Alloc()) {
   dijkstra_shortest_paths(g, subrange(&source, (&source + 1)), distance, _null_predecessor, forward<WF>(weight),
-                          forward<Visitor>(visitor), forward<Compare>(compare), forward<Combine>(combine), alloc);
+                          forward<Visitor>(visitor), forward<Compare>(compare), forward<Combine>(combine),
+                          heap_tag, alloc);
 }
 
 } // namespace graph
diff --git a/include/graph/algorithm/mst.hpp b/include/graph/algorithm/mst.hpp
index b0ca2d5..84638a3 100644
--- a/include/graph/algorithm/mst.hpp
+++ b/include/graph/algorithm/mst.hpp
@@ -194,17 +194,30 @@
  * **Performance Notes:**
  * 
  * **Prim's Priority Queue:**
- * This implementation uses a binary heap (std::priority_queue) which provides O(E log V)
- * complexity. While Fibonacci heap implementations achieve better theoretical complexity
- * O(E + V log V), they have significantly higher constant factors and more complex
- * bookkeeping. In practice:
- * - **Binary heap is faster** for most real-world graphs (used here)
- * - **Fibonacci heap** only wins for extremely dense graphs where E ≈ V² 
- *   and the improved amortized decrease-key operation dominates
- * - **Simple array** (O(V²)) is fastest for complete graphs where E = V(V-1)/2
- * 
- * Benchmark testing shows binary heap is optimal for graphs with 100-100,000 vertices
- * and typical densities (E = O(V) to O(V^1.5)).
+ * `prim()` is implemented as a thin wrapper over `dijkstra_shortest_paths`,
+ * so it inherits the same `Heap` template parameter:
+ *
+ * - `use_default_heap` (default): `std::priority_queue` with lazy deletion.
+ *   Provides O(E log V). Good general-purpose choice.
+ * - `use_indexed_dary_heap<D>`: indexed d-ary heap with true O(log_D V)
+ *   decrease-key. Recommended opt-in for high-E/V random / scale-free
+ *   workloads on `compressed_graph` (typically `D = 8`); see Dijkstra
+ *   Phase 4 results.
+ *
+ * Implementation note: because Prim's relaxation criterion is
+ * `compare(w_uv, weight[v])` rather than Dijkstra's
+ * `compare(d_u + w_uv, distance[v])`, Prim does not satisfy the
+ * monotonicity invariant Dijkstra's main loop assumes. `prim()` therefore
+ * maintains a small `std::vector<bool> finalized(V)` and wraps `weight_fn`
+ * so finalized targets report `+infinity`, which prevents the relax step
+ * from corrupting `weight[]` (the MST output) or calling `decrease()` on
+ * a vertex that has already been popped. See
+ * `agents/indexed_dary_heap_plan.md` § Phase 5.2 for the full discussion
+ * and the standalone-Prim alternative ("Option 2").
+ *
+ * Fibonacci heap implementations achieve O(E + V log V) but have higher
+ * constant factors and are not used here. A simple array (O(V²)) is fastest
+ * only for complete graphs.
  * 
  * ---
  * 
@@ -257,6 +270,8 @@
 #include "graph/algorithm/dijkstra_shortest_paths.hpp"
 #include <queue>
 #include <format>
+#include <vector>
+#include <unordered_set>
 
 #ifndef GRAPH_MST_HPP
 #  define GRAPH_MST_HPP
@@ -896,6 +911,7 @@ template <adjacency_list G,
           class          PredecessorFn,
           class WF = function<distance_fn_value_t<WeightFn, G>(const std::remove_reference_t<G>&, const edge_t<G>&)>,
           class CompareOp = less<distance_fn_value_t<WeightFn, G>>,
+          class Heap      = use_default_heap,
           class Alloc     = std::allocator<std::byte>>
 requires distance_fn_for<WeightFn, G> &&
          is_arithmetic_v<distance_fn_value_t<WeightFn, G>> &&
@@ -909,21 +925,104 @@ auto prim(G&&                   g,           // graph
                 [](const auto& gr, const edge_t<G>& uv) {
                   return edge_value(gr, uv);
                 }, // default weight_fn(g, uv) -> edge_value(g, uv)
-          CompareOp    compare = less<distance_fn_value_t<WeightFn, G>>(), // edge value comparator
-          const Alloc& alloc   = Alloc()
+          CompareOp    compare  = less<distance_fn_value_t<WeightFn, G>>(), // edge value comparator
+          Heap         heap_tag = Heap{},                                   // heap selector (use_default_heap or use_indexed_dary_heap<D>)
+          const Alloc& alloc    = Alloc()
 ) {
   using edge_value_type = distance_fn_value_t<WeightFn, G>;
+  using id_type         = vertex_id_t<G>;
 
   // Prim's combine: ignore accumulated distance, use edge weight directly.
   // This transforms Dijkstra's relaxation check from compare(d_u + w, d_v)
   // to compare(w, d_v), which is exactly Prim's criterion.
   auto prim_combine = [](edge_value_type /*d_u*/, edge_value_type w_uv) -> edge_value_type { return w_uv; };
 
-  dijkstra_shortest_paths(g, seed,
-                          std::forward<WeightFn>(weight),
-                          std::forward<PredecessorFn>(predecessor),
-                          std::forward<WF>(weight_fn), empty_visitor(),
-                          std::forward<CompareOp>(compare), prim_combine, alloc);
+  // ---------------------------------------------------------------------
+  // Prim correctness shim ("Option 1" in indexed_dary_heap_plan.md § 5.2).
+  //
+  // Dijkstra's main loop relies on the monotonicity invariant:
+  //   with non-negative weights and combine = plus, a finalized vertex's
+  //   distance can never be improved later.
+  // It therefore omits a "skip if finalized" guard in the relax step.
+  //
+  // Prim's combine (return w_uv, ignoring d_u) breaks that invariant:
+  // a finalized vertex v carries weight[v] = w_xv (the cheapest tree-edge
+  // found before v was popped). A later-popped neighbor y may present an
+  // edge y -> v with w_yv < weight[v]; relaxing it would *overwrite* the
+  // MST output (weight[] is the result, not a working distance), and with
+  // the indexed heap would call decrease() on a vertex whose heap position
+  // is npos -> out-of-bounds.
+  //
+  // Fix: track which vertices have been finalized and force weight_fn to
+  // report +infinity for any edge whose target is already finalized. The
+  // relax then computes compare(infinite, weight[v]) = false and the edge
+  // is skipped. This is correct for Prim because by definition a finalized
+  // vertex is already in the MST and cannot accept a cheaper tree-edge.
+  //
+  // The bitset add ~1 bit per vertex of memory and one predictable
+  // branch + bit-load per edge in the inner loop. See "Option 2" in the
+  // plan for a faster but more invasive standalone-Prim alternative.
+  //
+  // Storage strategy: for graphs with index_vertex_range (dense integer
+  // ids in [0, num_vertices)), a std::vector<bool> indexed by id is fast
+  // and cache-friendly. For sparse / mapped graphs whose ids are not
+  // contiguous (or not integral), an unordered_set is used instead.
+  // ---------------------------------------------------------------------
+  using GraphT = std::remove_reference_t<G>;
+  if constexpr (adj_list::index_vertex_range<GraphT>) {
+    std::vector<bool> finalized(num_vertices(g), false);
+
+    struct prim_finish_visitor {
+      std::vector<bool>* finalized_ptr;
+      void on_finish_vertex(const GraphT& /*gr*/, const id_type& uid) const {
+        (*finalized_ptr)[static_cast<std::size_t>(uid)] = true;
+      }
+    };
+    prim_finish_visitor visitor{&finalized};
+
+    auto wf_ref = std::ref(weight_fn);
+    auto guarded_weight_fn = [&finalized, wf_ref](
+          const GraphT& gr, const edge_t<G>& uv) -> edge_value_type {
+      const id_type vid = target_id(gr, uv);
+      if (finalized[static_cast<std::size_t>(vid)]) {
+        return infinite_distance<edge_value_type>();
+      }
+      return wf_ref.get()(gr, uv);
+    };
+
+    dijkstra_shortest_paths(g, seed,
+                            std::forward<WeightFn>(weight),
+                            std::forward<PredecessorFn>(predecessor),
+                            guarded_weight_fn, visitor,
+                            std::forward<CompareOp>(compare), prim_combine, heap_tag, alloc);
+  } else {
+    std::unordered_set<id_type> finalized;
+    finalized.reserve(num_vertices(g));
+
+    struct prim_finish_visitor {
+      std::unordered_set<id_type>* finalized_ptr;
+      void on_finish_vertex(const GraphT& /*gr*/, const id_type& uid) const {
+        finalized_ptr->insert(uid);
+      }
+    };
+    prim_finish_visitor visitor{&finalized};
+
+    auto wf_ref = std::ref(weight_fn);
+    auto guarded_weight_fn = [&finalized, wf_ref](
+          const GraphT& gr, const edge_t<G>& uv) -> edge_value_type {
+      const id_type vid = target_id(gr, uv);
+      if (finalized.contains(vid)) {
+        return infinite_distance<edge_value_type>();
+      }
+      return wf_ref.get()(gr, uv);
+    };
+
+    dijkstra_shortest_paths(g, seed,
+                            std::forward<WeightFn>(weight),
+                            std::forward<PredecessorFn>(predecessor),
+                            guarded_weight_fn, visitor,
+                            std::forward<CompareOp>(compare), prim_combine, heap_tag, alloc);
+  }
 
   // Calculate total MST weight by summing edge weights
   edge_value_type total_weight = edge_value_type{};
diff --git a/include/graph/detail/heap_position_map.hpp b/include/graph/detail/heap_position_map.hpp
new file mode 100644
index 0000000..3a61695
--- /dev/null
+++ b/include/graph/detail/heap_position_map.hpp
@@ -0,0 +1,114 @@
+/**
+ * @file heap_position_map.hpp
+ * @brief Position-map adapters for indexed_dary_heap.
+ *
+ * Two adapters are provided:
+ *
+ *   - vector_position_map  : O(1) lookup for integral keys in a known dense
+ *                            range [0, n). Backed by a caller-owned
+ *                            std::vector<size_t>.
+ *
+ *   - assoc_position_map   : O(1) average lookup for arbitrary hashable keys.
+ *                            Backed by a caller-owned std::unordered_map<Key, size_t>.
+ *                            Use this when vertex ids are sparse, non-integral,
+ *                            or come from a mapped graph container.
+ *
+ * Both adapters store a pointer to their backing storage; the storage must
+ * outlive the heap. This lets the caller reuse the same map across multiple
+ * Dijkstra runs (call reset() between runs).
+ *
+ * Concept (informal):
+ *   - sentinel: static constexpr size_t npos
+ *   - size_t  position(Key) const          // returns npos if not present
+ *   - void    set_position(Key, size_t)    // npos means "remove"
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <unordered_map>
+#include <vector>
+
+namespace graph::detail {
+
+// ---------------------------------------------------------------------------
+// vector_position_map
+//
+// O(1) position map for integral keys in [0, n). The caller owns the storage
+// vector, sized to n and initialised to npos. set_position(k, npos) marks k
+// as absent. reset() clears the entire map in O(n).
+// ---------------------------------------------------------------------------
+
+class vector_position_map {
+public:
+  static constexpr std::size_t npos = static_cast<std::size_t>(-1);
+
+  explicit vector_position_map(std::vector<std::size_t>& storage) noexcept
+        : storage_(&storage) {}
+
+  template <class Key>
+  [[nodiscard]] std::size_t position(const Key& k) const noexcept {
+    return (*storage_)[static_cast<std::size_t>(k)];
+  }
+
+  template <class Key>
+  void set_position(const Key& k, std::size_t pos) noexcept {
+    (*storage_)[static_cast<std::size_t>(k)] = pos;
+  }
+
+  /// Reset all entries to npos. O(n).
+  void reset() noexcept { std::fill(storage_->begin(), storage_->end(), npos); }
+
+  [[nodiscard]] std::size_t capacity() const noexcept { return storage_->size(); }
+
+private:
+  std::vector<std::size_t>* storage_;
+};
+
+// ---------------------------------------------------------------------------
+// assoc_position_map
+//
+// O(1) average position map for hashable keys (e.g. when vertex ids come from
+// a mapped graph and are non-contiguous, or non-integral entirely).
+//
+// Storage is a caller-owned std::unordered_map<Key, size_t>. set_position
+// with npos erases the key, keeping the map's size equal to the heap's size
+// at all times — so contains(k) reduces to a single lookup.
+// ---------------------------------------------------------------------------
+
+template <class Key,
+          class Hash    = std::hash<Key>,
+          class KeyEq   = std::equal_to<Key>,
+          class Alloc   = std::allocator<std::pair<const Key, std::size_t>>>
+class assoc_position_map {
+public:
+  using map_type                      = std::unordered_map<Key, std::size_t, Hash, KeyEq, Alloc>;
+  static constexpr std::size_t npos   = static_cast<std::size_t>(-1);
+
+  explicit assoc_position_map(map_type& storage) noexcept : storage_(&storage) {}
+
+  [[nodiscard]] std::size_t position(const Key& k) const {
+    auto it = storage_->find(k);
+    return (it == storage_->end()) ? npos : it->second;
+  }
+
+  void set_position(const Key& k, std::size_t pos) {
+    if (pos == npos) {
+      storage_->erase(k);
+    } else {
+      // Use insert_or_assign for O(1) amortised update with no temporary.
+      storage_->insert_or_assign(k, pos);
+    }
+  }
+
+  /// Drop all entries. O(n).
+  void reset() noexcept(noexcept(storage_->clear())) { storage_->clear(); }
+
+  [[nodiscard]] std::size_t tracked_size() const noexcept { return storage_->size(); }
+
+private:
+  map_type* storage_;
+};
+
+} // namespace graph::detail
diff --git a/include/graph/detail/indexed_dary_heap.hpp b/include/graph/detail/indexed_dary_heap.hpp
new file mode 100644
index 0000000..68c681c
--- /dev/null
+++ b/include/graph/detail/indexed_dary_heap.hpp
@@ -0,0 +1,291 @@
+/**
+ * @file indexed_dary_heap.hpp
+ * @brief External-key, indirect-comparison d-ary min-heap with O(log_d N)
+ *        decrease-key.
+ *
+ * Designed for Dijkstra and Prim where:
+ *   - Vertex ids serve as stable external keys.
+ *   - Distances live in a user-supplied container, accessed via a callable
+ *     @c DistanceFn(Key) -> const Distance& .
+ *   - The relax step needs O(log_d N) `decrease(key)` rather than O(N) re-push.
+ *
+ * The heap stores keys only. Distances are read live through @c DistanceFn so
+ * the heap never goes stale: when the algorithm updates a distance and calls
+ * @c decrease(k), the heap re-orders k against the current distance values.
+ *
+ * Position bookkeeping (key → heap index) is delegated to a @c PositionMap.
+ * A heap of N entries always has exactly one position recorded per contained
+ * key; non-contained keys map to @c npos. Every write to @c heap_[i] funnels
+ * through @c place_() to keep the map in sync.
+ *
+ * Complexity (Arity = d):
+ *   - push     : O(log_d N)
+ *   - pop      : O(d · log_d N)
+ *   - decrease : O(log_d N)
+ *   - top      : O(1)
+ *   - contains : O(1) lookup in the position map
+ *
+ * d = 4 minimises the product (d / log d) on typical Dijkstra workloads;
+ * see Boost.Graph's `d_ary_heap_indirect` and references therein.
+ *
+ * Concept-style requirements on @c PositionMap:
+ *   std::size_t pm.position(Key) const;          // returns indexed_dary_heap::npos if not present
+ *   void        pm.set_position(Key, std::size_t);
+ *
+ * Two adapters are provided in @c heap_position_map.hpp:
+ *   - @c vector_position_map  for dense integral keys
+ *   - @c assoc_position_map   for sparse / hashable keys
+ */
+
+#pragma once
+
+#include "heap_position_map.hpp"
+
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// GRAPH_DETAIL_FORCE_INLINE
+//
+// Force-inline attribute applied to the heap's hot helpers: less_than_,
+// place_, sift_up_, sift_down_.
+//
+// Background (Phase 4.3b, agents/indexed_dary_heap_results.md):
+// VTune software-mode hotspots on MSVC /O2 showed sift_down_ as a distinct
+// 31% call frame, with std::less<double>::operator() appearing as three
+// separate non-inlined copies and container_value_fn::operator() as a real
+// call — together ~50% of CPU time on Grid_Idx4/100K.
+//
+// Investigation (Phases 4.3c–d):
+// - Annotating only less_than_/place_ (result 004): no effect — the outer
+//   sift_down_ call frame still dominates; inner force-inline is local to
+//   the sift body and does not collapse the outer boundary.
+// - Annotating sift_down_/sift_up_ as well (result 005): also no effect —
+//   MSVC silently ignores __forceinline on functions of this complexity
+//   regardless of the annotation when the call site is a large template
+//   instantiation. The /O2 /Ob2 inline budget is the real blocker.
+//
+// The annotations are kept as-is because:
+//   (a) They are a no-op on GCC/Clang (already inlined at -O2+).
+//   (b) They document intent and may become effective with /Ob3 or a
+//       future MSVC version.
+//   (c) The next investigative step is to try /Ob3 in the release preset
+//       (see agents/indexed_dary_heap_results.md Phase 4.3d next steps).
+// ---------------------------------------------------------------------------
+#if defined(_MSC_VER)
+#  define GRAPH_DETAIL_FORCE_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#  define GRAPH_DETAIL_FORCE_INLINE [[gnu::always_inline]] inline
+#else
+#  define GRAPH_DETAIL_FORCE_INLINE inline
+#endif
+
+
+namespace graph::detail {
+
+// ---------------------------------------------------------------------------
+// indexed_dary_heap
+// ---------------------------------------------------------------------------
+
+// Arity is intentionally a compile-time template parameter, not a runtime value.
+// The performance benefit of a d-ary heap over a binary heap comes from
+// reducing tree height (fewer cache misses on decrease-key) while keeping the
+// inner child-scan loop tight enough to fit in registers.  That inner loop
+// iterates over exactly Arity children in sift_down and is the hottest path
+// during Dijkstra.  A compile-time Arity allows the compiler to fully unroll
+// it, elide the loop counter, and apply SIMD optimisations.  A runtime arity
+// would turn it into a variable-count loop and forfeit those gains.
+// The default Arity=4 minimises the product (d / log2 d) on typical
+// Dijkstra workloads; see Boost.Graph's d_ary_heap_indirect and the
+// analysis in agents/indexed_dary_heap_plan.md § Open Questions.
+template <class Key,
+          class DistanceFn,
+          class Compare,
+          class PositionMap,
+          std::size_t Arity     = 4,
+          class Allocator       = std::allocator<Key>>
+class indexed_dary_heap {
+  static_assert(Arity >= 2, "Arity must be at least 2");
+
+public:
+  using key_type        = Key;
+  using size_type       = std::size_t;
+  using distance_fn     = DistanceFn;
+  using compare_type    = Compare;
+  using position_map    = PositionMap;
+  using allocator_type  = Allocator;
+
+  static constexpr size_type arity = Arity;
+  static constexpr size_type npos  = static_cast<size_type>(-1);
+
+  indexed_dary_heap(DistanceFn dist, Compare comp, PositionMap pmap,
+                    const Allocator& alloc = Allocator())
+        : heap_(alloc), distance_(std::move(dist)), compare_(std::move(comp)),
+          position_(std::move(pmap)) {}
+
+  // ----- size / state ----------------------------------------------------
+
+  [[nodiscard]] bool      empty() const noexcept { return heap_.empty(); }
+  [[nodiscard]] size_type size()  const noexcept { return heap_.size(); }
+
+  void reserve(size_type n) { heap_.reserve(n); }
+
+  /// Remove all entries. Resets each contained key's position to npos.
+  void clear() noexcept {
+    for (const auto& k : heap_) {
+      position_.set_position(k, npos);
+    }
+    heap_.clear();
+  }
+
+  // ----- queries ---------------------------------------------------------
+
+  /// O(1). Returns the key with the smallest distance under @c Compare.
+  /// Precondition: !empty().
+  [[nodiscard]] const Key& top() const noexcept { return heap_.front(); }
+
+  /// O(1). True iff @c k is currently in the heap.
+  [[nodiscard]] bool contains(const Key& k) const noexcept {
+    return position_.position(k) != npos;
+  }
+
+  // ----- modifiers -------------------------------------------------------
+
+  /// O(log_d N). Insert @c k. Behaviour is undefined if @c k is already
+  /// present — callers should use @c decrease() for re-insertions.
+  void push(const Key& k) {
+    const size_type i = heap_.size();
+    heap_.push_back(k);
+    position_.set_position(k, i);
+    sift_up_(i);
+  }
+
+  /// O(d · log_d N). Remove the top element.
+  /// Precondition: !empty().
+  void pop() {
+    const Key removed = heap_.front();
+    position_.set_position(removed, npos);
+
+    const size_type last = heap_.size() - 1;
+    if (last == 0) {
+      heap_.pop_back();
+      return;
+    }
+    // Move last → root, then sift down.
+    place_(0, heap_[last]);
+    heap_.pop_back();
+    sift_down_(0);
+  }
+
+  /// O(log_d N). Notify the heap that @c k's distance has decreased
+  /// (under @c Compare). Sifts @c k up only.
+  /// Precondition: contains(k).
+  void decrease(const Key& k) {
+    const size_type i = position_.position(k);
+    sift_up_(i);
+  }
+
+  /// Equivalent to @c push(k) if !contains(k), else @c decrease(k).
+  /// Convenience wrapper for the common Dijkstra relax pattern.
+  void push_or_decrease(const Key& k) {
+    const size_type i = position_.position(k);
+    if (i == npos) {
+      push(k);
+    } else {
+      sift_up_(i);
+    }
+  }
+
+  // ----- accessors (mostly for testing / introspection) ------------------
+
+  [[nodiscard]] const PositionMap& position_map_ref() const noexcept { return position_; }
+  [[nodiscard]] PositionMap&       position_map_ref()       noexcept { return position_; }
+
+private:
+  // -----------------------------------------------------------------------
+  // Heap topology helpers
+  // -----------------------------------------------------------------------
+
+  static constexpr size_type parent_of_(size_type i) noexcept {
+    return (i - 1) / Arity;
+  }
+  static constexpr size_type first_child_of_(size_type i) noexcept {
+    return Arity * i + 1;
+  }
+
+  /// Place @c k at index @c i and update the position map. Single point of
+  /// truth for `heap_[i] = k` — guarantees position_ stays consistent.
+  GRAPH_DETAIL_FORCE_INLINE
+  void place_(size_type i, const Key& k) {
+    heap_[i] = k;
+    position_.set_position(k, i);
+  }
+
+  /// Strict-less wrapper using the user's Compare on distances. Every sift
+  /// loop calls through here so that one inlining decision (this function)
+  /// is enough to collapse the entire comparator chain to a single compare
+  /// instruction — see GRAPH_DETAIL_FORCE_INLINE rationale above.
+  [[nodiscard]] GRAPH_DETAIL_FORCE_INLINE
+  bool less_than_(const Key& a, const Key& b) const {
+    return compare_(distance_(a), distance_(b));
+  }
+
+  // -----------------------------------------------------------------------
+  // Sift operations
+  //
+  // Implemented "hole-style": instead of swap-walking, we pull the original
+  // value out, walk the hole, then drop the value into its final slot. Saves
+  // one write per level vs. a naive swap loop.
+  // -----------------------------------------------------------------------
+
+  void sift_up_(size_type i) {
+    if (i == 0) return;
+    const Key k = heap_[i];
+    while (i > 0) {
+      const size_type p = parent_of_(i);
+      if (!less_than_(k, heap_[p])) {
+        break;
+      }
+      place_(i, heap_[p]); // move parent down into the hole
+      i = p;
+    }
+    place_(i, k);
+  }
+
+  void sift_down_(size_type i) {
+    const size_type n = heap_.size();
+    if (n == 0) return;
+    const Key k = heap_[i];
+
+    while (true) {
+      const size_type first = first_child_of_(i);
+      if (first >= n) break;
+
+      // Find the smallest child in [first, first + Arity).
+      const size_type last  = (first + Arity < n) ? first + Arity : n;
+      size_type       best  = first;
+      for (size_type c = first + 1; c < last; ++c) {
+        if (less_than_(heap_[c], heap_[best])) {
+          best = c;
+        }
+      }
+
+      if (!less_than_(heap_[best], k)) {
+        break; // k is no greater than its smallest child → done
+      }
+      place_(i, heap_[best]); // promote the smallest child into the hole
+      i = best;
+    }
+    place_(i, k);
+  }
+
+  std::vector<Key, Allocator> heap_;
+  DistanceFn                  distance_;
+  Compare                     compare_;
+  PositionMap                 position_;
+};
+
+} // namespace graph::detail
diff --git a/scripts/perf/README.md b/scripts/perf/README.md
new file mode 100644
index 0000000..6d1f0ff
--- /dev/null
+++ b/scripts/perf/README.md
@@ -0,0 +1,91 @@
+# scripts/perf - performance investigation tooling
+
+Helpers for running benchmarks, parsing VTune output, indexing exe symbols,
+and pulling targeted disassembly across both MSVC (`dumpbin`) and GCC
+(`objdump`). Built to support `agents/csr_edge_value_perf_plan.md` and
+`agents/thread_b_linux_runbook.md`.
+
+All scripts are stdlib-only (Python 3.10+).
+
+## Files
+
+| Script | OS | Purpose |
+|---|---|---|
+| `bench_run.py`        | both | Run a benchmark filter with core-pinning + High priority; emit median rows as JSON. |
+| `bench_compare.py`    | both | Diff two `bench_run.py` JSONs as a markdown delta table. |
+| `vtune_top.py`        | both | Parse a VTune CSV hotspots report; emit a normalized top-N. |
+| `sym_index.py`        | win  | Disk-cached `dumpbin /disasm:nobytes` parser. ~30s cold, ~0.5s warm. |
+| `find_func.py`        | win  | Symbol search wrapper around `sym_index`; supports `--regex`. |
+| `disasm_func.py`      | win  | Single-function disasm via `dumpbin /range:`. |
+| `capture_asm.py`      | win  | Bulk-dump a manifest of functions in one cache-warm pass. |
+| `objdump_capture.py`  | linux | Linux/GCC counterpart of `capture_asm.py` using `nm` + `objdump`. |
+| `linux_gcc_capture.sh`| linux | One-shot runbook driver: bench + perf-stat + objdump. |
+
+## Avoiding cmd-redirection of `<` and `>`
+
+Use `--regex` instead of `--pattern` for any filter that needs angle
+brackets. Even better, replace them with `.` wildcards so the arg is plain
+text (e.g. `use_indexed_dary_heap.4.` instead of `use_indexed_dary_heap<4>`).
+Both `disasm_func.py`, `find_func.py`, and the manifests in
+`agents/perf_capture_manifest*.txt` follow this convention.
+
+## Cache files
+
+`sym_index.py` writes `<exe>.symidx.json` next to the exe. Cache is
+invalidated automatically when the exe's size or mtime changes.
+`objdump_capture.py` does the same on Linux.
+
+## Workflow examples
+
+### One-shot bulk capture (used to populate `artifacts/perf/msvc_profile/`)
+
+```pwsh
+# From a vcvars64 shell.
+python scripts/perf/capture_asm.py `
+  --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe `
+  --manifest agents/perf_capture_manifest.txt `
+  --out-dir artifacts/perf/msvc_profile
+```
+
+### Hotspot table from a VTune collection
+
+```pwsh
+& "C:\Program Files (x86)\Intel\oneAPI\vtune\latest\bin64\vtune.exe" `
+  -collect hotspots -knob sampling-mode=sw `
+  -result-dir vtune/hot_001 -- `
+  build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe `
+  --benchmark_filter="BM_Dijkstra_CSR_Grid_Idx4/100000" --benchmark_min_time=15s
+
+& "C:\Program Files (x86)\Intel\oneAPI\vtune\latest\bin64\vtune.exe" `
+  -report hotspots -r vtune/hot_001 -format csv > artifacts/perf/hot_001.csv
+
+python scripts/perf/vtune_top.py --csv artifacts/perf/hot_001.csv --top 15 --markdown
+```
+
+### Bench A vs bench B
+
+```pwsh
+python scripts/perf/bench_run.py --exe ... --filter ... `
+  --label baseline --out artifacts/perf/baseline.json
+# (apply change, rebuild)
+python scripts/perf/bench_run.py --exe ... --filter ... `
+  --label candidate --out artifacts/perf/candidate.json
+python scripts/perf/bench_compare.py `
+  --baseline artifacts/perf/baseline.json `
+  --candidate artifacts/perf/candidate.json `
+  --threshold 5
+```
+
+### Linux/WSL counterpart capture
+
+See `agents/thread_b_linux_runbook.md`. The one-liner is:
+
+```bash
+bash scripts/perf/linux_gcc_capture.sh
+```
+
+## Output convention
+
+All scripts write either to stdout or to `--out <path>`. The convention used
+in `agents/`-side docs is `artifacts/perf/<phase-or-toolchain>/...`.
+`artifacts/` is gitignored — these are working captures, not source-of-truth.
diff --git a/scripts/perf/__init__.py b/scripts/perf/__init__.py
new file mode 100644
index 0000000..7e17005
--- /dev/null
+++ b/scripts/perf/__init__.py
@@ -0,0 +1,6 @@
+# Performance investigation tooling.
+#
+# See scripts/perf/README.md for usage. These scripts orchestrate Google
+# Benchmark runs, parse VTune CSV exports, and target dumpbin output to
+# support the work documented in agents/csr_edge_value_perf_plan.md and
+# agents/indexed_dary_heap_results.md.
diff --git a/scripts/perf/bench_compare.py b/scripts/perf/bench_compare.py
new file mode 100644
index 0000000..93ff33e
--- /dev/null
+++ b/scripts/perf/bench_compare.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""
+bench_compare.py — diff two bench_run.py JSON outputs as a markdown table.
+
+Joins on (benchmark name, aggregate). Default aggregate is `median`.
+Emits a markdown table with absolute times and Δ%, plus regression / win flags.
+
+Example:
+    python scripts/perf/bench_compare.py \
+        --baseline artifacts/grid_ob2.json \
+        --candidate artifacts/grid_ob3.json \
+        --threshold 5
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Optional
+
+
+def _load(path: Path) -> dict:
+    return json.loads(path.read_text())
+
+
+def _index_rows(payload: dict, agg: str) -> dict[str, float]:
+    return {
+        r["name"]: r["real_time_ns"]
+        for r in payload["rows"]
+        if r["aggregate"] == agg
+    }
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--baseline", type=Path, required=True)
+    ap.add_argument("--candidate", type=Path, required=True)
+    ap.add_argument("--aggregate", default="median",
+                    choices=["mean", "median", "stddev", "cv"],
+                    help="Aggregate to compare (default median).")
+    ap.add_argument("--threshold", type=float, default=5.0,
+                    help="Δ%% above which to flag regression (⚠) or win (✅).")
+    ap.add_argument("--label-baseline", default=None,
+                    help="Override column header for baseline (default: from JSON).")
+    ap.add_argument("--label-candidate", default=None,
+                    help="Override column header for candidate.")
+    ap.add_argument("--out", type=Path, help="Write markdown to file (default stdout).")
+    args = ap.parse_args()
+
+    base = _load(args.baseline)
+    cand = _load(args.candidate)
+    base_rows = _index_rows(base, args.aggregate)
+    cand_rows = _index_rows(cand, args.aggregate)
+
+    label_base = args.label_baseline or base.get("label") or args.baseline.stem
+    label_cand = args.label_candidate or cand.get("label") or args.candidate.stem
+
+    # union of keys, sorted
+    keys = sorted(set(base_rows) | set(cand_rows))
+    if not keys:
+        raise SystemExit(f"no rows with aggregate={args.aggregate!r} in either file")
+
+    lines = [
+        f"| Benchmark | {label_base} (ns) | {label_cand} (ns) | Δ % |",
+        "|---|---:|---:|---:|",
+    ]
+    for k in keys:
+        b = base_rows.get(k)
+        c = cand_rows.get(k)
+        if b is None:
+            lines.append(f"| {k} | — | {c:,.0f} | new |")
+            continue
+        if c is None:
+            lines.append(f"| {k} | {b:,.0f} | — | dropped |")
+            continue
+        delta = (c - b) / b * 100.0
+        flag = ""
+        if delta >= args.threshold:
+            flag = " ⚠"
+        elif delta <= -args.threshold:
+            flag = " ✅"
+        lines.append(f"| {k} | {b:,.0f} | {c:,.0f} | {delta:+.1f} %{flag} |")
+
+    text = "\n".join(lines) + "\n"
+    if args.out:
+        args.out.parent.mkdir(parents=True, exist_ok=True)
+        args.out.write_text(text)
+        print(f"wrote table to {args.out}")
+    else:
+        print(text)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/perf/bench_run.py b/scripts/perf/bench_run.py
new file mode 100644
index 0000000..21a94ae
--- /dev/null
+++ b/scripts/perf/bench_run.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+bench_run.py — run a benchmark filter, capture median rows as JSON.
+
+Wraps the manual core-pin / priority-High / 5-rep / median pattern used
+throughout Phase 4.x perf work, and emits a structured result instead of
+PowerShell `Select-String "median"` plumbing.
+
+Example:
+    python scripts/perf/bench_run.py \
+        --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \
+        --filter "BM_Dijkstra_(CSR|BGL_CSR)_Grid(_Idx4)?/(10000|100000)$" \
+        --reps 5 --min-time 2s \
+        --out artifacts/bench_grid_msvc_profile.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import ctypes
+import json
+import re
+import subprocess
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Optional
+
+
+# Windows process-priority constants
+PRIORITY_HIGH = 0x00000080
+
+
+@dataclass
+class BenchRow:
+    name: str            # benchmark name, e.g. BM_Dijkstra_CSR_Grid_Idx4/100000
+    aggregate: str       # mean / median / stddev / cv
+    real_time_ns: float
+    cpu_time_ns: float
+    iterations: int
+
+
+# Lines look like:
+#   BM_Dijkstra_CSR_Grid_Idx4/100000_median   7440434 ns      7280759 ns         5
+# stddev rows have ns; cv rows have %.
+_ROW_RE = re.compile(
+    r"^(?P<name>BM_\S+?)_(?P<agg>mean|median|stddev|cv)\s+"
+    r"(?P<rt>\S+)\s+(?:ns|%)\s+(?P<cpu>\S+)\s+(?:ns|%)\s+(?P<iter>\d+)\s*$"
+)
+
+
+def parse_rows(text: str) -> list[BenchRow]:
+    rows: list[BenchRow] = []
+    for line in text.splitlines():
+        m = _ROW_RE.match(line)
+        if not m:
+            continue
+        try:
+            rows.append(
+                BenchRow(
+                    name=m.group("name"),
+                    aggregate=m.group("agg"),
+                    real_time_ns=float(m.group("rt")),
+                    cpu_time_ns=float(m.group("cpu")),
+                    iterations=int(m.group("iter")),
+                )
+            )
+        except ValueError:
+            # silently skip malformed
+            pass
+    return rows
+
+
+def _set_affinity_and_priority(pid: int, affinity_mask: int) -> None:
+    """Pin process to cores in `affinity_mask` and set HIGH priority. Windows-only."""
+    if sys.platform != "win32":
+        return
+    PROCESS_ALL_ACCESS = 0x1F0FFF
+    h = ctypes.windll.kernel32.OpenProcess(PROCESS_ALL_ACCESS, False, pid)
+    if not h:
+        print(f"warning: OpenProcess({pid}) failed", file=sys.stderr)
+        return
+    try:
+        if not ctypes.windll.kernel32.SetProcessAffinityMask(h, affinity_mask):
+            print(f"warning: SetProcessAffinityMask failed", file=sys.stderr)
+        if not ctypes.windll.kernel32.SetPriorityClass(h, PRIORITY_HIGH):
+            print(f"warning: SetPriorityClass failed", file=sys.stderr)
+    finally:
+        ctypes.windll.kernel32.CloseHandle(h)
+
+
+def run_benchmark(
+    exe: Path,
+    bench_filter: str,
+    reps: int,
+    min_time: str,
+    affinity_mask: int = 0x1,
+    aggregates_only: bool = True,
+    extra_args: Optional[list[str]] = None,
+) -> tuple[str, list[BenchRow]]:
+    args = [
+        str(exe),
+        f"--benchmark_filter={bench_filter}",
+        f"--benchmark_min_time={min_time}",
+        f"--benchmark_repetitions={reps}",
+    ]
+    if aggregates_only:
+        args.append("--benchmark_report_aggregates_only=true")
+    if extra_args:
+        args.extend(extra_args)
+
+    # Start suspended so we can pin before the first iteration. Easiest cross-version
+    # approach: start normally, immediately pin, then wait. The first ~ms of run loses
+    # the pin, but Google Benchmark's per-rep median and our 5-rep aggregate absorb it.
+    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    _set_affinity_and_priority(proc.pid, affinity_mask)
+    out, _ = proc.communicate()
+    if proc.returncode != 0:
+        print(out, file=sys.stderr)
+        raise SystemExit(f"benchmark exited with {proc.returncode}")
+    return out, parse_rows(out)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Run a benchmark filter and capture rows as JSON.")
+    ap.add_argument("--exe", type=Path, required=True, help="Path to benchmark executable.")
+    ap.add_argument("--filter", required=True, help="--benchmark_filter regex.")
+    ap.add_argument("--reps", type=int, default=5, help="--benchmark_repetitions (default 5).")
+    ap.add_argument("--min-time", default="2s", help="--benchmark_min_time (default 2s).")
+    ap.add_argument("--affinity", type=lambda s: int(s, 0), default=0x1,
+                    help="Process affinity mask (default 0x1 = core 0).")
+    ap.add_argument("--out", type=Path, help="Write JSON to this path (default: stdout).")
+    ap.add_argument("--label", default="", help="Free-form label stored in the JSON output.")
+    ap.add_argument("--print-stdout", action="store_true",
+                    help="Also print the raw benchmark stdout to this process's stderr.")
+    args = ap.parse_args()
+
+    if not args.exe.exists():
+        raise SystemExit(f"executable not found: {args.exe}")
+
+    raw, rows = run_benchmark(
+        args.exe,
+        args.filter,
+        args.reps,
+        args.min_time,
+        affinity_mask=args.affinity,
+    )
+    if args.print_stdout:
+        print(raw, file=sys.stderr)
+
+    payload = {
+        "label": args.label,
+        "exe": str(args.exe),
+        "filter": args.filter,
+        "reps": args.reps,
+        "min_time": args.min_time,
+        "affinity_mask": hex(args.affinity),
+        "rows": [asdict(r) for r in rows],
+    }
+    text = json.dumps(payload, indent=2)
+    if args.out:
+        args.out.parent.mkdir(parents=True, exist_ok=True)
+        args.out.write_text(text)
+        print(f"wrote {len(rows)} rows to {args.out}")
+    else:
+        print(text)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/perf/capture_asm.py b/scripts/perf/capture_asm.py
new file mode 100644
index 0000000..581ef32
--- /dev/null
+++ b/scripts/perf/capture_asm.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""capture_asm.py - bulk-dump a curated list of functions into one directory.
+
+Manifest format (one capture per non-blank, non-# line; whitespace-separated):
+
+    <basename>[:N]   <length_hex>   <regex>   [substring1 substring2 ...]
+
+  - basename:   output filename stem
+  - :N optional 0-based index (default 0) to disambiguate when the
+                regex+substrings still match more than one symbol
+  - length_hex: how many bytes to disassemble from the symbol's RVA
+  - regex:      Python re matched against the demangled symbol name
+                (use this for patterns containing < or >)
+  - substrings: AND-filtered after the regex match
+"""
+
+from __future__ import annotations
+
+import argparse
+import shlex
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from sym_index import disasm_range, filter_symbols, index_functions  # noqa: E402
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--exe", type=Path, required=True)
+    ap.add_argument("--manifest", type=Path, required=True)
+    ap.add_argument("--out-dir", type=Path, required=True)
+    ap.add_argument("--rebuild-cache", action="store_true")
+    args = ap.parse_args()
+
+    syms = index_functions(args.exe, force_rebuild=args.rebuild_cache)
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    n_ok = 0
+    n_skip = 0
+    for raw in args.manifest.read_text().splitlines():
+        line = raw.strip()
+        if not line or line.startswith("#"):
+            continue
+        parts = shlex.split(line)
+        if len(parts) < 3:
+            print(f"  skip (need basename, length, regex): {line}", file=sys.stderr)
+            n_skip += 1
+            continue
+
+        basename, length_str, first_regex, *rest = parts
+
+        pick = 0
+        if ":" in basename:
+            basename, pick_str = basename.rsplit(":", 1)
+            try:
+                pick = int(pick_str)
+            except ValueError:
+                print(f"  skip (bad :N suffix on {basename!r}): {line}", file=sys.stderr)
+                n_skip += 1
+                continue
+
+        try:
+            length = int(length_str, 0)
+        except ValueError:
+            print(f"  skip (bad length {length_str!r}): {line}", file=sys.stderr)
+            n_skip += 1
+            continue
+
+        matches = filter_symbols(syms, substrings=rest, regexes=[first_regex])
+        if not matches:
+            print(f"  no match: {basename}  (regex={first_regex!r} subs={rest!r})", file=sys.stderr)
+            n_skip += 1
+            continue
+        if pick >= len(matches):
+            print(f"  skip ({basename}: pick={pick} but only {len(matches)} matches)", file=sys.stderr)
+            n_skip += 1
+            continue
+        if len(matches) > 1 and pick == 0 and ":" not in raw.split()[0]:
+            short = matches[0].name if len(matches[0].name) <= 140 else matches[0].name[:140] + "..."
+            print(f"  note: {basename}: {len(matches)} matches; using [0] ({short})", file=sys.stderr)
+
+        sym = matches[pick]
+        asm = disasm_range(args.exe, sym.rva, sym.rva + length)
+        out_path = args.out_dir / f"{basename}.asm"
+        out_path.write_text(asm)
+        print(f"  OK  {basename:<32}  0x{sym.rva:x}  pick={pick}  {len(asm.splitlines())} lines")
+        n_ok += 1
+
+    print(f"\ncaptured {n_ok}, skipped {n_skip}", file=sys.stderr)
+    return 0 if n_skip == 0 else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/perf/disasm_func.py b/scripts/perf/disasm_func.py
new file mode 100644
index 0000000..e2e4f1e
--- /dev/null
+++ b/scripts/perf/disasm_func.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+disasm_func.py - disassemble one function from a Windows exe.
+
+Selects a function by --pattern (substring) and/or --regex (full Python re),
+then dumps just that function's bytes via dumpbin /range.
+
+The symbol index is cached on disk by sym_index.py, so repeated invocations
+on the same exe are near-instant after the first call.
+
+Avoid putting raw '<' or '>' on a Windows command line: cmd treats them as
+redirection. Use --regex with escaped angle brackets instead, e.g.:
+    --regex 'use_indexed_dary_heap<4>'
+
+Example:
+    python scripts/perf/disasm_func.py \
+        --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \
+        --regex 'use_indexed_dary_heap<4>' --pattern sift_down_ \
+        --out artifacts/perf/sift_down_idx4.asm
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from sym_index import (  # noqa: E402
+    disasm_range,
+    filter_symbols,
+    index_functions,
+)
+
+
+def _truncate(name: str, n: int) -> str:
+    return name if len(name) <= n else name[:n] + "\u2026"
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--exe", type=Path, required=True)
+    ap.add_argument("--pattern", action="append", default=[],
+                    help="substring filter (multiple = AND).")
+    ap.add_argument("--regex", action="append", default=[],
+                    help="Python regex filter (multiple = AND). Use this for patterns with angle brackets.")
+    ap.add_argument("--list-only", action="store_true")
+    ap.add_argument("--length", type=lambda s: int(s, 0), default=0x1000)
+    ap.add_argument("--match-index", type=int, default=0)
+    ap.add_argument("--rebuild-cache", action="store_true")
+    ap.add_argument("--out", type=Path)
+    ap.add_argument("--no-truncate", action="store_true")
+    args = ap.parse_args()
+
+    if not args.exe.exists():
+        raise SystemExit(f"exe not found: {args.exe}")
+    if not args.pattern and not args.regex:
+        raise SystemExit("need at least one --pattern or --regex")
+
+    syms = index_functions(args.exe, force_rebuild=args.rebuild_cache)
+    matches = filter_symbols(syms, args.pattern, args.regex)
+    if not matches:
+        sys.stderr.write(f"no symbols matched: patterns={args.pattern} regexes={args.regex}\n")
+        return 1
+
+    print(f"matches ({len(matches)} of {len(syms)} indexed):", file=sys.stderr)
+    for i, s in enumerate(matches[:30]):
+        name = s.name if args.no_truncate else _truncate(s.name, 200)
+        print(f"  [{i}] 0x{s.rva:x}  {name}", file=sys.stderr)
+    if len(matches) > 30:
+        print(f"  ... +{len(matches) - 30} more", file=sys.stderr)
+
+    if args.list_only:
+        return 0
+
+    if args.match_index >= len(matches):
+        raise SystemExit(f"--match-index {args.match_index} out of range (have {len(matches)})")
+    sym = matches[args.match_index]
+    asm = disasm_range(args.exe, sym.rva, sym.rva + args.length)
+    if args.out:
+        args.out.parent.mkdir(parents=True, exist_ok=True)
+        args.out.write_text(asm)
+        print(f"wrote {len(asm.splitlines())} lines to {args.out}", file=sys.stderr)
+    else:
+        print(asm)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/perf/find_func.py b/scripts/perf/find_func.py
new file mode 100644
index 0000000..6846000
--- /dev/null
+++ b/scripts/perf/find_func.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""find_func.py - search the cached symbol index by substring(s) and/or regex(es).
+
+Light wrapper for sym_index. Use --regex when the pattern needs angle
+brackets (cmd treats < and > as redirection).
+
+Example:
+    python scripts/perf/find_func.py \
+        --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \
+        --pattern compressed_graph --pattern lambda_2 \
+        --regex 'use_indexed_dary_heap<4>'
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from sym_index import filter_symbols, index_functions  # noqa: E402
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--exe", type=Path, required=True)
+    ap.add_argument("--pattern", action="append", default=[])
+    ap.add_argument("--regex", action="append", default=[])
+    ap.add_argument("--limit", type=int, default=20)
+    ap.add_argument("--no-truncate", action="store_true")
+    ap.add_argument("--rebuild-cache", action="store_true")
+    args = ap.parse_args()
+
+    if not args.pattern and not args.regex:
+        raise SystemExit("need at least one --pattern or --regex")
+
+    syms = index_functions(args.exe, force_rebuild=args.rebuild_cache)
+    matches = filter_symbols(syms, args.pattern, args.regex)
+    print(f"{len(matches)} match(es) of {len(syms)} indexed:")
+    for i, s in enumerate(matches[: args.limit]):
+        name = s.name if args.no_truncate else (s.name if len(s.name) <= 200 else s.name[:200] + "\u2026")
+        print(f"  [{i}] 0x{s.rva:x}  {name}")
+    if len(matches) > args.limit:
+        print(f"  ... +{len(matches) - args.limit} more")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/perf/linux_gcc_capture.sh b/scripts/perf/linux_gcc_capture.sh
new file mode 100644
index 0000000..7083516
--- /dev/null
+++ b/scripts/perf/linux_gcc_capture.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# linux_gcc_capture.sh - capture the Linux/GCC counterpart of the MSVC reference
+# in artifacts/perf/msvc_profile/.
+#
+# WSL has no hardware performance counters (PMU is not exposed), so this
+# script intentionally avoids `perf stat -e cache-misses,...` and uses only:
+#   - wall-clock (Google Benchmark median across 5 reps)
+#   - software perf events that work in WSL
+#   - objdump for per-symbol disassembly comparison
+#
+# Run from the workspace root after cloning to a Linux/WSL machine:
+#
+#   cmake --preset linux-gcc-release
+#   cmake --build --preset linux-gcc-release
+#   bash scripts/perf/linux_gcc_capture.sh
+#
+# Output lands in artifacts/perf/linux_gcc/, mirroring artifacts/perf/msvc_profile/.
+
+set -euo pipefail
+
+EXE="${1:-build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra}"
+OUT_DIR="${2:-artifacts/perf/linux_gcc}"
+
+if [[ ! -x "$EXE" ]]; then
+  echo "ERROR: benchmark exe not found or not executable: $EXE" >&2
+  echo "Usage: $0 [path/to/benchmark_dijkstra] [out_dir]" >&2
+  exit 2
+fi
+
+mkdir -p "$OUT_DIR"
+echo "==> capturing Linux/GCC reference into $OUT_DIR"
+
+# ---------- 1. wall-clock baseline ----------
+echo "--- 1. wall-clock baseline (5 reps, median, taskset core 4) ---"
+taskset -c 4 python3 scripts/perf/bench_run.py \
+  --exe "$EXE" \
+  --filter 'BM_Dijkstra_(CSR|BGL_CSR)_(ER_Sparse|Grid|BA|Path)(_Idx4)?/(10000|100000)$' \
+  --reps 5 --min-time 2s \
+  --label "linux-gcc-release" \
+  --out "$OUT_DIR/wallclock_baseline.json"
+
+# ---------- 2. software perf-stat counters (WSL-friendly) ----------
+# These software events do NOT need the PMU; they work in WSL.
+SW_EVENTS="task-clock,context-switches,page-faults,cpu-migrations,instructions:u,cycles:u"
+echo "--- 2. perf stat (software events, no PMU required) ---"
+for bench in BM_Dijkstra_CSR_Grid_Idx4/100000 BM_Dijkstra_BGL_CSR_Grid/100000 \
+             BM_Dijkstra_CSR_Path_Idx4/100000 BM_Dijkstra_BGL_CSR_Path/100000; do
+  safe="${bench//\//_}"
+  echo "  perf stat $bench"
+  taskset -c 4 perf stat -e "$SW_EVENTS" -r 3 -- \
+    "$EXE" --benchmark_filter="^${bench}$" --benchmark_min_time=3s \
+    > "$OUT_DIR/perfstat_${safe}.stdout" \
+    2> "$OUT_DIR/perfstat_${safe}.stderr" || \
+    echo "    note: perf stat returned non-zero for $bench (may indicate no PMU)" >&2
+done
+
+# ---------- 3. objdump per-symbol captures ----------
+# GCC's objdump does the demangling MSVC's dumpbin does, but with --demangle.
+echo "--- 3. objdump captures (mirrors MSVC manifest) ---"
+python3 scripts/perf/objdump_capture.py \
+  --exe "$EXE" \
+  --manifest agents/perf_capture_manifest_linux.txt \
+  --out-dir "$OUT_DIR"
+
+echo
+echo "==> Linux capture complete. Diff against MSVC with:"
+echo "    python scripts/perf/bench_compare.py \\"
+echo "      --baseline artifacts/perf/msvc_profile/wallclock_baseline.json \\"
+echo "      --candidate $OUT_DIR/wallclock_baseline.json \\"
+echo "      --label-baseline msvc --label-candidate gcc"
diff --git a/scripts/perf/objdump_capture.py b/scripts/perf/objdump_capture.py
new file mode 100644
index 0000000..44d9b3e
--- /dev/null
+++ b/scripts/perf/objdump_capture.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+objdump_capture.py - Linux/GCC counterpart of capture_asm.py.
+
+Same manifest format as capture_asm.py, but uses `objdump` instead of
+`dumpbin` and operates on demangled C++ symbol names from `nm --demangle`.
+
+Required tools (Linux/WSL):
+  - objdump  (binutils)
+  - nm       (binutils)
+
+Manifest format (one capture per non-blank, non-# line):
+
+    <basename>[:N]   <length_hex>   <regex>   [substring1 substring2 ...]
+
+The Linux manifest (agents/perf_capture_manifest_linux.txt) parallels the
+MSVC one but accounts for:
+  - Itanium mangling vs MSVC mangling (e.g. `_Z...` vs `??...`)
+  - GCC's tendency to emit `.cold` partitions (fold them into the main body
+    by extending the --length)
+  - BGL's d_ary_heap_indirect inlines preserve_heap_property_down on GCC,
+    so its capture often returns `<inlined>` only.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+
+@dataclass
+class Symbol:
+    name: str
+    addr: int
+    size: int = 0
+
+
+def _which(tool: str) -> str:
+    p = shutil.which(tool)
+    if not p:
+        raise SystemExit(f"{tool} not on PATH (need binutils on Linux/WSL)")
+    return p
+
+
+def _cache_path(exe: Path) -> Path:
+    return exe.with_suffix(exe.suffix + ".symidx.json")
+
+
+def _exe_fingerprint(exe: Path) -> dict:
+    st = exe.stat()
+    return {"path": str(exe), "size": st.st_size, "mtime_ns": st.st_mtime_ns}
+
+
+# nm output line format with --demangle and -S (sizes):
+#   0000000000003fa0 0000000000000123 T graph::dijkstra_shortest_paths<...>(...)
+_NM_RE = re.compile(
+    r"^(?P<addr>[0-9a-f]+)\s+(?P<size>[0-9a-f]+)\s+(?P<type>[A-Za-z])\s+(?P<name>.+)$"
+)
+
+
+def index_symbols(exe: Path, *, force_rebuild: bool = False) -> list[Symbol]:
+    cache = _cache_path(exe)
+    if not force_rebuild and cache.exists():
+        try:
+            payload = json.loads(cache.read_text())
+            if payload.get("fingerprint") == _exe_fingerprint(exe):
+                return [Symbol(**s) for s in payload["symbols"]]
+        except (OSError, json.JSONDecodeError, KeyError):
+            pass
+
+    nm = _which("nm")
+    print(f"indexing functions in {exe.name} via nm --demangle ...", file=sys.stderr)
+    proc = subprocess.run(
+        [nm, "--demangle", "--print-size", "--defined-only",
+         "--no-sort", str(exe)],
+        capture_output=True, text=True, errors="replace",
+    )
+    if proc.returncode != 0:
+        sys.stderr.write(proc.stderr)
+        raise SystemExit(f"nm failed ({proc.returncode})")
+
+    syms: list[Symbol] = []
+    for line in proc.stdout.splitlines():
+        m = _NM_RE.match(line)
+        if not m:
+            continue
+        if m.group("type").lower() not in ("t", "w"):  # text or weak text
+            continue
+        try:
+            syms.append(Symbol(
+                name=m.group("name"),
+                addr=int(m.group("addr"), 16),
+                size=int(m.group("size"), 16),
+            ))
+        except ValueError:
+            pass
+
+    print(f"  indexed {len(syms)} text symbols", file=sys.stderr)
+    try:
+        cache.write_text(json.dumps({
+            "fingerprint": _exe_fingerprint(exe),
+            "symbols": [asdict(s) for s in syms],
+        }))
+    except OSError as e:
+        print(f"  warning: failed to write cache: {e}", file=sys.stderr)
+    return syms
+
+
+def filter_symbols(
+    syms,
+    substrings=(),
+    regexes=(),
+):
+    sub = list(substrings)
+    rxs = [re.compile(r) for r in regexes]
+    out = []
+    for s in syms:
+        if all(p in s.name for p in sub) and all(rx.search(s.name) for rx in rxs):
+            out.append(s)
+    return out
+
+
+def disasm_range(exe: Path, start: int, end: int) -> str:
+    objdump = _which("objdump")
+    proc = subprocess.run(
+        [objdump, "-d", "--demangle", "--no-show-raw-insn",
+         f"--start-address=0x{start:x}", f"--stop-address=0x{end:x}",
+         str(exe)],
+        capture_output=True, text=True, errors="replace",
+    )
+    if proc.returncode != 0:
+        sys.stderr.write(proc.stderr)
+        raise SystemExit(f"objdump failed ({proc.returncode})")
+    return proc.stdout
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--exe", type=Path, required=True)
+    ap.add_argument("--manifest", type=Path, required=True)
+    ap.add_argument("--out-dir", type=Path, required=True)
+    ap.add_argument("--rebuild-cache", action="store_true")
+    args = ap.parse_args()
+
+    syms = index_symbols(args.exe, force_rebuild=args.rebuild_cache)
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    n_ok = n_skip = 0
+    for raw in args.manifest.read_text().splitlines():
+        line = raw.strip()
+        if not line or line.startswith("#"):
+            continue
+        parts = shlex.split(line)
+        if len(parts) < 3:
+            print(f"  skip (need basename, length, regex): {line}", file=sys.stderr)
+            n_skip += 1
+            continue
+
+        basename, length_str, first_regex, *rest = parts
+        pick = 0
+        if ":" in basename:
+            basename, pick_str = basename.rsplit(":", 1)
+            try:
+                pick = int(pick_str)
+            except ValueError:
+                print(f"  skip (bad :N suffix on {basename!r})", file=sys.stderr)
+                n_skip += 1
+                continue
+        try:
+            length = int(length_str, 0)
+        except ValueError:
+            print(f"  skip (bad length {length_str!r})", file=sys.stderr)
+            n_skip += 1
+            continue
+
+        matches = filter_symbols(syms, substrings=rest, regexes=[first_regex])
+        if not matches:
+            print(f"  no match: {basename}  (regex={first_regex!r} subs={rest!r})", file=sys.stderr)
+            n_skip += 1
+            continue
+        if pick >= len(matches):
+            print(f"  skip ({basename}: pick={pick} but only {len(matches)} matches)", file=sys.stderr)
+            n_skip += 1
+            continue
+
+        sym = matches[pick]
+        # Use nm-reported size when available; else fall back to manifest length.
+        end = sym.addr + (sym.size if sym.size else length)
+        asm = disasm_range(args.exe, sym.addr, end)
+        out_path = args.out_dir / f"{basename}.asm"
+        out_path.write_text(asm)
+        print(f"  OK  {basename:<32}  0x{sym.addr:x}  pick={pick}  "
+              f"size={sym.size}  {len(asm.splitlines())} lines")
+        n_ok += 1
+
+    print(f"\ncaptured {n_ok}, skipped {n_skip}", file=sys.stderr)
+    return 0 if n_skip == 0 else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/perf/sym_index.py b/scripts/perf/sym_index.py
new file mode 100644
index 0000000..cc37956
--- /dev/null
+++ b/scripts/perf/sym_index.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""
+sym_index.py — disk-cached symbol index for a Windows exe.
+
+`dumpbin /disasm:nobytes` on a 1.4 MB benchmark takes ~30 s and returns
+14k+ function entries. We need to query that table dozens of times during
+a perf investigation; caching the parse result to a JSON file next to
+the exe drops repeated lookups to <100 ms.
+
+Cache invalidation: by exe mtime+size in the cache header.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import shutil
+import subprocess
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterable
+
+
+@dataclass
+class Symbol:
+    name: str
+    rva: int  # virtual address relative to image base
+
+
+_FUNC_HEADER_RE = re.compile(r"^(?P<name>[^\s].*?):\s*$")
+_ADDR_RE        = re.compile(r"^\s*(?P<addr>[0-9A-Fa-f]{8,16}):\s")
+
+
+def _find_dumpbin() -> Path:
+    p = shutil.which("dumpbin")
+    if not p:
+        raise SystemExit(
+            "dumpbin not on PATH \u2014 run from a vcvars64 shell."
+        )
+    return Path(p)
+
+
+def _parse_dumpbin_output(text: str) -> list[Symbol]:
+    syms: list[Symbol] = []
+    pending: str | None = None
+    for line in text.splitlines():
+        if not line:
+            pending = None
+            continue
+        if pending is None:
+            if _ADDR_RE.match(line):
+                continue
+            m = _FUNC_HEADER_RE.match(line)
+            if not m:
+                continue
+            cand = m.group("name").strip()
+            if cand.startswith("Dump of"):
+                continue
+            head = cand.split()[0] if cand else ""
+            if "0x" in head:
+                continue
+            pending = cand
+            continue
+        m = _ADDR_RE.match(line)
+        if m:
+            try:
+                syms.append(Symbol(name=pending, rva=int(m.group("addr"), 16)))
+            except ValueError:
+                pass
+            pending = None
+    return syms
+
+
+def _cache_path(exe: Path) -> Path:
+    return exe.with_suffix(exe.suffix + ".symidx.json")
+
+
+def _exe_fingerprint(exe: Path) -> dict:
+    st = exe.stat()
+    return {"path": str(exe), "size": st.st_size, "mtime_ns": st.st_mtime_ns}
+
+
+def _read_cache(exe: Path) -> list[Symbol] | None:
+    cache = _cache_path(exe)
+    if not cache.exists():
+        return None
+    try:
+        payload = json.loads(cache.read_text())
+    except (OSError, json.JSONDecodeError):
+        return None
+    if payload.get("fingerprint") != _exe_fingerprint(exe):
+        return None
+    return [Symbol(**s) for s in payload.get("symbols", [])]
+
+
+def _write_cache(exe: Path, syms: list[Symbol]) -> None:
+    cache = _cache_path(exe)
+    payload = {
+        "fingerprint": _exe_fingerprint(exe),
+        "symbols": [asdict(s) for s in syms],
+    }
+    try:
+        cache.write_text(json.dumps(payload))
+    except OSError as e:
+        print(f"warning: failed to write {cache}: {e}", file=sys.stderr)
+
+
+def index_functions(exe: Path, *, force_rebuild: bool = False) -> list[Symbol]:
+    """Return the function-entry list for `exe`, caching the result on disk."""
+    if not force_rebuild:
+        cached = _read_cache(exe)
+        if cached is not None:
+            return cached
+
+    dumpbin = _find_dumpbin()
+    print(f"indexing functions in {exe.name} (one-time, ~30s) ...", file=sys.stderr)
+    proc = subprocess.run(
+        [str(dumpbin), "/disasm:nobytes", "/nologo", str(exe)],
+        capture_output=True, text=True, errors="replace",
+    )
+    if proc.returncode != 0:
+        sys.stderr.write(proc.stderr)
+        raise SystemExit(f"dumpbin failed ({proc.returncode})")
+
+    syms = _parse_dumpbin_output(proc.stdout)
+    print(f"  indexed {len(syms)} functions; cached to {_cache_path(exe).name}", file=sys.stderr)
+    _write_cache(exe, syms)
+    return syms
+
+
+def filter_symbols(
+    syms: Iterable[Symbol],
+    substrings: Iterable[str] = (),
+    regexes: Iterable[str] = (),
+    *,
+    include_ilt_thunks: bool = False,
+) -> list[Symbol]:
+    """Return symbols matching ALL substrings AND ALL regexes.
+
+    @ILT+... entries (incremental linker thunks - small forwarders, not real
+    bodies) are skipped by default; pass include_ilt_thunks=True to keep them.
+    """
+    sub = list(substrings)
+    rxs = [re.compile(r) for r in regexes]
+    out: list[Symbol] = []
+    for s in syms:
+        if not include_ilt_thunks and s.name.startswith("@ILT"):
+            continue
+        if all(p in s.name for p in sub) and all(rx.search(s.name) for rx in rxs):
+            out.append(s)
+    return out
+
+
+def disasm_range(exe: Path, start: int, end: int) -> str:
+    dumpbin = _find_dumpbin()
+    proc = subprocess.run(
+        [str(dumpbin), "/disasm", "/nologo",
+         f"/range:0x{start:x},0x{end:x}", str(exe)],
+        capture_output=True, text=True, errors="replace",
+    )
+    if proc.returncode != 0:
+        sys.stderr.write(proc.stderr)
+        raise SystemExit(f"dumpbin /range failed ({proc.returncode})")
+    return proc.stdout
diff --git a/scripts/perf/vtune_top.py b/scripts/perf/vtune_top.py
new file mode 100644
index 0000000..924c41f
--- /dev/null
+++ b/scripts/perf/vtune_top.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+vtune_top.py — read a VTune `-format csv` hotspots report, emit clean top-N.
+
+Replaces the brittle PowerShell parsing used in earlier Phase 4.x runs:
+    $vtune -report hotspots -r <dir> -format csv | <regex juggling>
+
+Symbol normalization rules:
+  graph::detail::indexed_dary_heap<...>::F   → heap::F
+  graph::container_value_fn<...>::F          → cfn::F
+  graph::detail::vector_position_map::F      → pm::F
+  std::less<...>::F                          → less::F
+  std::vector<T,Alloc>::F                    → vector<T>::F   (drops the Alloc)
+  std::_Vector_iterator<...>::F              → _Vector_iter::F
+  graph::views::incidence_view<...>::F       → incidence_view::F
+
+Example:
+    vtune.exe -report hotspots -r vtune/hot_001 -format csv > hot.csv
+    python scripts/perf/vtune_top.py --csv hot.csv --top 15
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class Hotspot:
+    function: str
+    cpu_time: float
+    module: str = ""
+    source: str = ""
+
+
+def _strip_template(name: str) -> str:
+    """Remove balanced angle-brackets from `name`."""
+    out: list[str] = []
+    depth = 0
+    for ch in name:
+        if ch == "<":
+            depth += 1
+            continue
+        if ch == ">":
+            depth = max(0, depth - 1)
+            continue
+        if depth == 0:
+            out.append(ch)
+    return "".join(out)
+
+
+_NORMALIZERS: list[tuple[re.Pattern[str], str]] = [
+    (re.compile(r"graph::detail::indexed_dary_heap<.*?>::"),       "heap::"),
+    (re.compile(r"graph::container_value_fn<.*?>::"),              "cfn::"),
+    (re.compile(r"graph::detail::vector_position_map::"),          "pm::"),
+    (re.compile(r"std::less<.*?>::"),                              "less::"),
+    (re.compile(r"std::_Vector_iterator<.*?>::"),                  "_Vector_iter::"),
+    (re.compile(r"graph::views::incidence_view<.*?>::"),           "incidence_view::"),
+    (re.compile(r"std::vector<([^,>]+),.*?>::"),                   r"vector<\1>::"),
+]
+
+
+def normalize(name: str) -> str:
+    s = name
+    for rx, repl in _NORMALIZERS:
+        s = rx.sub(repl, s)
+    # Trim crazy-long template instantiations on the bare end ("X<...>") that no rule matched.
+    if "<" in s and len(s) > 120:
+        s = _strip_template(s) + "<...>"
+    return s.strip()
+
+
+def load_csv(path: Path) -> list[Hotspot]:
+    spots: list[Hotspot] = []
+    # VTune CSV is tab-delimited despite the name.
+    with path.open(newline="", encoding="utf-8", errors="replace") as f:
+        reader = csv.reader(f, delimiter="\t")
+        header: Optional[list[str]] = None
+        for row in reader:
+            if not row or row[0].lower().startswith(("function", "vtune")):
+                if row and row[0].lower() == "function":
+                    header = row
+                continue
+            if len(row) < 2:
+                continue
+            try:
+                cpu = float(row[1])
+            except ValueError:
+                continue
+            module = row[5] if len(row) > 5 else ""
+            source = row[7] if len(row) > 7 else ""
+            spots.append(Hotspot(function=row[0], cpu_time=cpu, module=module, source=source))
+    return spots
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--csv", type=Path, required=True, help="VTune hotspots CSV (tab-separated).")
+    ap.add_argument("--top", type=int, default=15, help="Rows to show (default 15).")
+    ap.add_argument("--no-normalize", action="store_true",
+                    help="Skip the heap::/cfn::/etc. normalization rules.")
+    ap.add_argument("--markdown", action="store_true",
+                    help="Emit a markdown table instead of plain text.")
+    args = ap.parse_args()
+
+    spots = load_csv(args.csv)
+    if not spots:
+        print(f"no rows parsed from {args.csv}", file=sys.stderr)
+        return 1
+
+    total = sum(s.cpu_time for s in spots)
+    spots.sort(key=lambda s: s.cpu_time, reverse=True)
+    top = spots[: args.top]
+
+    if args.markdown:
+        print(f"Total CPU collected: **{total:.2f} s** across {len(spots)} symbols\n")
+        print("| Rank | Function | CPU (s) | % |")
+        print("|---:|---|---:|---:|")
+        for i, s in enumerate(top, 1):
+            name = s.function if args.no_normalize else normalize(s.function)
+            pct = s.cpu_time / total * 100 if total else 0
+            print(f"| {i} | `{name}` | {s.cpu_time:.3f} | {pct:.1f} |")
+    else:
+        print(f"Total CPU: {total:.2f} s across {len(spots)} symbols")
+        for i, s in enumerate(top, 1):
+            name = s.function if args.no_normalize else normalize(s.function)
+            pct = s.cpu_time / total * 100 if total else 0
+            short = (name[:75] + "…") if len(name) > 76 else name
+            print(f"  {i:2d}. {pct:5.1f} %  {s.cpu_time:7.3f}s  {short}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/algorithms/CMakeLists.txt b/tests/algorithms/CMakeLists.txt
index 657e2ed..5126358 100644
--- a/tests/algorithms/CMakeLists.txt
+++ b/tests/algorithms/CMakeLists.txt
@@ -18,6 +18,8 @@ add_executable(test_algorithms
     test_jaccard.cpp
     test_scc_bidirectional.cpp
     test_tarjan_scc.cpp
+    test_indexed_dary_heap.cpp
+    test_dijkstra_indexed_heap.cpp
 )
 
 target_link_libraries(test_algorithms
diff --git a/tests/algorithms/test_dijkstra_indexed_heap.cpp b/tests/algorithms/test_dijkstra_indexed_heap.cpp
new file mode 100644
index 0000000..26c7742
--- /dev/null
+++ b/tests/algorithms/test_dijkstra_indexed_heap.cpp
@@ -0,0 +1,483 @@
+/**
+ * @file test_dijkstra_indexed_heap.cpp
+ * @brief Phase 2 tests for the indexed d-ary heap path of dijkstra_shortest_paths.
+ *
+ * These tests:
+ *   1. Re-run core Dijkstra scenarios with `use_indexed_dary_heap<>` and
+ *      assert identical distances/predecessors as the default-heap path.
+ *   2. Audit visitor call counts: examine, finish, edge-relaxed, and
+ *      edge-not-relaxed events must match between the two heap paths
+ *      (Phase 2.3 visitor-semantics audit).
+ */
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/catch_template_test_macros.hpp>
+#include <graph/algorithm/dijkstra_shortest_paths.hpp>
+#include "../common/graph_fixtures.hpp"
+#include "../common/algorithm_test_types.hpp"
+#include "../common/map_graph_fixtures.hpp"
+#include <graph/adj_list/vertex_property_map.hpp>
+#include <graph/container/dynamic_graph.hpp>
+#include <graph/container/traits/mov_graph_traits.hpp>
+
+#include <string>
+#include <vector>
+
+using namespace graph;
+using namespace graph::adj_list;
+using namespace graph::test;
+using namespace graph::test::fixtures;
+using namespace graph::test::algorithm;
+
+namespace {
+
+// Visitor that records exact call counts for every event Dijkstra fires.
+struct CountingVisitor {
+  int initialize = 0;
+  int discover   = 0;
+  int examine    = 0;
+  int finish     = 0;
+  int relaxed    = 0;
+  int not_relaxed = 0;
+
+  template <typename G, typename V> void on_initialize_vertex(const G&, const V&) { ++initialize; }
+  template <typename G, typename V> void on_discover_vertex (const G&, const V&) { ++discover;  }
+  template <typename G, typename V> void on_examine_vertex  (const G&, const V&) { ++examine;   }
+  template <typename G, typename V> void on_finish_vertex   (const G&, const V&) { ++finish;    }
+  template <typename G, typename E> void on_edge_relaxed    (const G&, const E&) { ++relaxed;   }
+  template <typename G, typename E> void on_edge_not_relaxed(const G&, const E&) { ++not_relaxed; }
+};
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Correctness: indexed heap produces the same distances as the default heap
+// ---------------------------------------------------------------------------
+
+TEST_CASE("dijkstra(indexed_heap) - CLRS example matches default heap",
+          "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+
+  auto                            g = clrs_dijkstra_graph<Graph>();
+  std::vector<int>                distance(num_vertices(g));
+  std::vector<vertex_id_t<Graph>> predecessor(num_vertices(g));
+  init_shortest_paths(g, distance, predecessor);
+
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(distance),
+                          container_value_fn(predecessor),
+                          [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+                          empty_visitor{},
+                          std::less<int>{},
+                          std::plus<int>{},
+                          use_indexed_dary_heap<4>{},
+                          std::allocator<std::byte>{});
+
+  for (size_t i = 0; i < clrs_dijkstra_results::distances_from_0.size(); ++i) {
+    CHECK(distance[i] == clrs_dijkstra_results::distances_from_0[i]);
+  }
+}
+
+TEST_CASE("dijkstra(indexed_heap) - path graph", "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+
+  auto                            g = path_graph_4_weighted<Graph>();
+  std::vector<int>                distance(num_vertices(g));
+  std::vector<vertex_id_t<Graph>> predecessor(num_vertices(g));
+  init_shortest_paths(g, distance, predecessor);
+
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(distance),
+                          container_value_fn(predecessor),
+                          [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+                          empty_visitor{},
+                          std::less<int>{},
+                          std::plus<int>{},
+                          use_indexed_dary_heap<>{},
+                          std::allocator<std::byte>{});
+
+  for (size_t i = 0; i < path_graph_4_results::num_vertices; ++i) {
+    CHECK(distance[i] == path_graph_4_results::distances[i]);
+  }
+}
+
+TEST_CASE("dijkstra(indexed_heap) - multi-source CLRS", "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+
+  auto             g = clrs_dijkstra_graph<Graph>();
+  std::vector<int> distance(num_vertices(g));
+  std::vector<vertex_id_t<Graph>> predecessor(num_vertices(g));
+  init_shortest_paths(g, distance, predecessor);
+
+  std::vector<vertex_id_t<Graph>> sources = {0, 3};
+
+  dijkstra_shortest_paths(g, sources,
+                          container_value_fn(distance),
+                          container_value_fn(predecessor),
+                          [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+                          empty_visitor{},
+                          std::less<int>{},
+                          std::plus<int>{},
+                          use_indexed_dary_heap<>{},
+                          std::allocator<std::byte>{});
+
+  // Both sources start at distance 0; every vertex is reachable.
+  CHECK(distance[0] == 0);
+  CHECK(distance[3] == 0);
+  for (auto d : distance) CHECK(d != infinite_distance<int>());
+}
+
+// Resolves Open Question 4: confirm that with multi-source seeding, the
+// indexed-heap path fires on_examine_vertex / on_finish_vertex / on_discover
+// the same number of times as the default-heap path. With non-negative
+// weights, every source is pushed at distance 0 and finalized on its first
+// pop (no later relax can lower distance below 0), so the visitor sees each
+// vertex exactly once on both paths.
+TEST_CASE("dijkstra(indexed_heap) - multi-source visitor parity vs default heap",
+          "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+  auto g = clrs_dijkstra_graph<Graph>();
+  const auto N = num_vertices(g);
+  std::vector<vertex_id_t<Graph>> sources = {0, 3};
+
+  auto run = [&](auto heap_tag) {
+    std::vector<int>                distance(N);
+    std::vector<vertex_id_t<Graph>> predecessor(N);
+    init_shortest_paths(g, distance, predecessor);
+    CountingVisitor v{};
+    dijkstra_shortest_paths(g, sources,
+                            container_value_fn(distance),
+                            container_value_fn(predecessor),
+                            [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+                            v,
+                            std::less<int>{},
+                            std::plus<int>{},
+                            heap_tag);
+    return std::tuple{v, distance};
+  };
+
+  auto [v_def,  d_def ] = run(use_default_heap{});
+  auto [v_idx4, d_idx4] = run(use_indexed_dary_heap<4>{});
+  auto [v_idx8, d_idx8] = run(use_indexed_dary_heap<8>{});
+
+  // Distances must agree (sanity).
+  CHECK(d_def == d_idx4);
+  CHECK(d_def == d_idx8);
+
+  // Visitor parity. Each vertex examined / finished exactly once; each
+  // vertex (incl. each source) discovered exactly once.
+  CHECK(v_def.examine  == static_cast<int>(N));
+  CHECK(v_def.finish   == static_cast<int>(N));
+  CHECK(v_def.discover == static_cast<int>(N));
+
+  CHECK(v_idx4.examine  == v_def.examine);
+  CHECK(v_idx4.finish   == v_def.finish);
+  CHECK(v_idx4.discover == v_def.discover);
+  CHECK(v_idx4.relaxed  == v_def.relaxed);
+  CHECK(v_idx4.not_relaxed == v_def.not_relaxed);
+
+  CHECK(v_idx8.examine  == v_def.examine);
+  CHECK(v_idx8.finish   == v_def.finish);
+  CHECK(v_idx8.discover == v_def.discover);
+  CHECK(v_idx8.relaxed  == v_def.relaxed);
+  CHECK(v_idx8.not_relaxed == v_def.not_relaxed);
+}
+
+TEST_CASE("dijkstra(indexed_heap) - distances-only overload",
+          "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+
+  auto             g = clrs_dijkstra_graph<Graph>();
+  std::vector<int> distance(num_vertices(g));
+  init_shortest_paths(g, distance);
+
+  dijkstra_shortest_distances(g, vertex_id_t<Graph>(0),
+                              container_value_fn(distance),
+                              [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+                              empty_visitor{},
+                              std::less<int>{},
+                              std::plus<int>{},
+                              use_indexed_dary_heap<>{},
+                              std::allocator<std::byte>{});
+
+  for (size_t i = 0; i < clrs_dijkstra_results::distances_from_0.size(); ++i) {
+    CHECK(distance[i] == clrs_dijkstra_results::distances_from_0[i]);
+  }
+}
+
+TEST_CASE("dijkstra(indexed_heap) - arity 2 and arity 8 produce same distances",
+          "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+
+  auto g = clrs_dijkstra_graph<Graph>();
+
+  std::vector<int>                d2(num_vertices(g)), d8(num_vertices(g));
+  std::vector<vertex_id_t<Graph>> p2(num_vertices(g)), p8(num_vertices(g));
+  init_shortest_paths(g, d2, p2);
+  init_shortest_paths(g, d8, p8);
+
+  auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); };
+
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(d2), container_value_fn(p2),
+                          wt, empty_visitor{}, std::less<int>{}, std::plus<int>{},
+                          use_indexed_dary_heap<2>{}, std::allocator<std::byte>{});
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(d8), container_value_fn(p8),
+                          wt, empty_visitor{}, std::less<int>{}, std::plus<int>{},
+                          use_indexed_dary_heap<8>{}, std::allocator<std::byte>{});
+
+  CHECK(d2 == d8);
+}
+
+// ---------------------------------------------------------------------------
+// Visitor call-count parity (Phase 2.3)
+// ---------------------------------------------------------------------------
+
+TEST_CASE("dijkstra(indexed_heap) - visitor call counts match default heap",
+          "[algorithm][dijkstra][indexed_heap][visitor]") {
+  using Graph = vov_weighted;
+
+  auto g = clrs_dijkstra_graph<Graph>();
+  auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); };
+
+  CountingVisitor v_default;
+  CountingVisitor v_indexed;
+
+  std::vector<int>                d_def(num_vertices(g)), d_idx(num_vertices(g));
+  std::vector<vertex_id_t<Graph>> p_def(num_vertices(g)), p_idx(num_vertices(g));
+  init_shortest_paths(g, d_def, p_def);
+  init_shortest_paths(g, d_idx, p_idx);
+
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(d_def), container_value_fn(p_def),
+                          wt, v_default,
+                          std::less<int>{}, std::plus<int>{},
+                          use_default_heap{}, std::allocator<std::byte>{});
+
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(d_idx), container_value_fn(p_idx),
+                          wt, v_indexed,
+                          std::less<int>{}, std::plus<int>{},
+                          use_indexed_dary_heap<>{}, std::allocator<std::byte>{});
+
+  // Distances must agree.
+  CHECK(d_def == d_idx);
+
+  // Visitor call counts must agree exactly. Per Dijkstra invariants, every
+  // reachable vertex is examined and finished once, every outgoing edge of
+  // an examined vertex is either relaxed or not-relaxed exactly once, and
+  // discover fires once per reachable vertex.
+  CHECK(v_default.initialize  == v_indexed.initialize);
+  CHECK(v_default.discover    == v_indexed.discover);
+  CHECK(v_default.examine     == v_indexed.examine);
+  CHECK(v_default.finish      == v_indexed.finish);
+  CHECK(v_default.relaxed     == v_indexed.relaxed);
+  CHECK(v_default.not_relaxed == v_indexed.not_relaxed);
+
+  // Also assert the absolute invariant counts (5 reachable vertices in CLRS).
+  CHECK(v_indexed.examine == 5);
+  CHECK(v_indexed.finish  == 5);
+  CHECK(v_indexed.discover == 5);
+}
+
+TEST_CASE("dijkstra(indexed_heap) - visitor parity on path graph",
+          "[algorithm][dijkstra][indexed_heap][visitor]") {
+  using Graph = vov_weighted;
+
+  auto g  = path_graph_4_weighted<Graph>();
+  auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); };
+
+  CountingVisitor v_default, v_indexed;
+
+  std::vector<int>                d_def(num_vertices(g)), d_idx(num_vertices(g));
+  std::vector<vertex_id_t<Graph>> p_def(num_vertices(g)), p_idx(num_vertices(g));
+  init_shortest_paths(g, d_def, p_def);
+  init_shortest_paths(g, d_idx, p_idx);
+
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(d_def), container_value_fn(p_def),
+                          wt, v_default,
+                          std::less<int>{}, std::plus<int>{},
+                          use_default_heap{}, std::allocator<std::byte>{});
+  dijkstra_shortest_paths(g, vertex_id_t<Graph>(0),
+                          container_value_fn(d_idx), container_value_fn(p_idx),
+                          wt, v_indexed,
+                          std::less<int>{}, std::plus<int>{},
+                          use_indexed_dary_heap<>{}, std::allocator<std::byte>{});
+
+  CHECK(d_def == d_idx);
+  CHECK(v_default.discover    == v_indexed.discover);
+  CHECK(v_default.examine     == v_indexed.examine);
+  CHECK(v_default.finish      == v_indexed.finish);
+  CHECK(v_default.relaxed     == v_indexed.relaxed);
+  CHECK(v_default.not_relaxed == v_indexed.not_relaxed);
+}
+
+// ---------------------------------------------------------------------------
+// Source-out-of-range still throws on the indexed-heap path
+// ---------------------------------------------------------------------------
+
+TEST_CASE("dijkstra(indexed_heap) - throws on out-of-range source",
+          "[algorithm][dijkstra][indexed_heap]") {
+  using Graph = vov_weighted;
+
+  auto             g = clrs_dijkstra_graph<Graph>();
+  std::vector<int> distance(num_vertices(g));
+  init_shortest_paths(g, distance);
+
+  CHECK_THROWS_AS(
+        dijkstra_shortest_distances(
+              g, vertex_id_t<Graph>(num_vertices(g) + 1),
+              container_value_fn(distance),
+              [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+              empty_visitor{}, std::less<int>{}, std::plus<int>{},
+              use_indexed_dary_heap<>{}, std::allocator<std::byte>{}),
+        std::out_of_range);
+}
+
+// ---------------------------------------------------------------------------
+// Phase 3 - mapped-container support (assoc_position_map)
+//
+// SPARSE_VERTEX_TYPES are the map / unordered_map vertex containers
+// (mov, mod, mol, uov, uod, uol). They do not satisfy index_vertex_range,
+// so the indexed-heap path must select assoc_position_map automatically.
+// ---------------------------------------------------------------------------
+
+TEMPLATE_TEST_CASE("dijkstra(indexed_heap) - sparse CLRS matches default heap",
+                   "[algorithm][dijkstra][indexed_heap][sparse]",
+                   SPARSE_VERTEX_TYPES) {
+  using Graph   = TestType;
+  using id_type = vertex_id_t<Graph>;
+  using namespace graph::test::map_fixtures;
+
+  static_assert(!adj_list::index_vertex_range<Graph>,
+                "SPARSE_VERTEX_TYPES must be mapped containers");
+
+  const auto& exp = clrs_dijkstra_sparse_expected{};
+  auto        g   = map_fixtures::clrs_dijkstra_graph<Graph>();
+
+  auto d_def = make_vertex_property_map<Graph, int>(g, infinite_distance<int>());
+  auto p_def = make_vertex_property_map<Graph, id_type>(g, id_type{});
+  auto d_idx = make_vertex_property_map<Graph, int>(g, infinite_distance<int>());
+  auto p_idx = make_vertex_property_map<Graph, id_type>(g, id_type{});
+  for (auto&& [uid, u] : views::vertexlist(g)) {
+    p_def[uid] = uid;
+    p_idx[uid] = uid;
+  }
+
+  auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); };
+
+  dijkstra_shortest_paths(g, id_type(exp.s),
+                          container_value_fn(d_def), container_value_fn(p_def),
+                          wt, empty_visitor{}, std::less<int>{}, std::plus<int>{},
+                          use_default_heap{}, std::allocator<std::byte>{});
+
+  dijkstra_shortest_paths(g, id_type(exp.s),
+                          container_value_fn(d_idx), container_value_fn(p_idx),
+                          wt, empty_visitor{}, std::less<int>{}, std::plus<int>{},
+                          use_indexed_dary_heap<>{}, std::allocator<std::byte>{});
+
+  // Distances must agree with the textbook results and across heap paths.
+  for (size_t i = 0; i < exp.num_vertices; ++i) {
+    CHECK(d_idx[exp.vertex_ids[i]] == exp.distances[i]);
+    CHECK(d_idx[exp.vertex_ids[i]] == d_def[exp.vertex_ids[i]]);
+  }
+  CHECK(p_idx[exp.s] == exp.s);
+}
+
+TEMPLATE_TEST_CASE("dijkstra(indexed_heap) - sparse visitor parity",
+                   "[algorithm][dijkstra][indexed_heap][sparse][visitor]",
+                   SPARSE_VERTEX_TYPES) {
+  using Graph   = TestType;
+  using id_type = vertex_id_t<Graph>;
+  using namespace graph::test::map_fixtures;
+
+  const auto& exp = clrs_dijkstra_sparse_expected{};
+  auto        g   = map_fixtures::clrs_dijkstra_graph<Graph>();
+
+  auto d_def = make_vertex_property_map<Graph, int>(g, infinite_distance<int>());
+  auto d_idx = make_vertex_property_map<Graph, int>(g, infinite_distance<int>());
+
+  CountingVisitor v_default, v_indexed;
+  auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); };
+
+  dijkstra_shortest_distances(g, id_type(exp.s), container_value_fn(d_def),
+                              wt, v_default, std::less<int>{}, std::plus<int>{},
+                              use_default_heap{}, std::allocator<std::byte>{});
+  dijkstra_shortest_distances(g, id_type(exp.s), container_value_fn(d_idx),
+                              wt, v_indexed, std::less<int>{}, std::plus<int>{},
+                              use_indexed_dary_heap<>{}, std::allocator<std::byte>{});
+
+  CHECK(v_default.discover    == v_indexed.discover);
+  CHECK(v_default.examine     == v_indexed.examine);
+  CHECK(v_default.finish      == v_indexed.finish);
+  CHECK(v_default.relaxed     == v_indexed.relaxed);
+  CHECK(v_default.not_relaxed == v_indexed.not_relaxed);
+
+  // 5 reachable vertices in the CLRS graph.
+  CHECK(v_indexed.examine == 5);
+  CHECK(v_indexed.finish  == 5);
+}
+
+// ---------------------------------------------------------------------------
+// Phase 3 - non-integral vertex IDs (std::string keys)
+//
+// Exercises the assoc_position_map path with a hashable, non-integral key
+// type. SPARSE_VERTEX_TYPES use uint32_t keys, so this test covers the
+// remaining hashable_vertex_id branch of the if constexpr dispatch.
+// ---------------------------------------------------------------------------
+
+TEST_CASE("dijkstra(indexed_heap) - string vertex IDs (CLRS topology)",
+          "[algorithm][dijkstra][indexed_heap][sparse][string_id]") {
+  using VId    = std::string;
+  using Traits = graph::container::mov_graph_traits<int, void, void, VId, false>;
+  using Graph  = graph::container::dynamic_graph<int, void, void, VId, false, Traits>;
+
+  static_assert(!adj_list::index_vertex_range<Graph>,
+                "string-keyed graph must not satisfy index_vertex_range");
+  static_assert(adj_list::hashable_vertex_id<Graph>,
+                "std::string must satisfy hashable_vertex_id");
+
+  // CLRS Figure 24.6 with string keys.
+  Graph g({{"s", "t", 10}, {"s", "y", 5},
+           {"t", "x", 1},  {"t", "y", 2},
+           {"x", "z", 4},
+           {"y", "t", 3},  {"y", "x", 9}, {"y", "z", 2},
+           {"z", "s", 7},  {"z", "x", 6}});
+
+  auto d_def = make_vertex_property_map<Graph, int>(g, infinite_distance<int>());
+  auto p_def = make_vertex_property_map<Graph, VId>(g, VId{});
+  auto d_idx = make_vertex_property_map<Graph, int>(g, infinite_distance<int>());
+  auto p_idx = make_vertex_property_map<Graph, VId>(g, VId{});
+  for (auto&& [uid, u] : views::vertexlist(g)) {
+    p_def[uid] = uid;
+    p_idx[uid] = uid;
+  }
+
+  auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); };
+
+  VId source{"s"};
+  dijkstra_shortest_paths(g, source,
+                          container_value_fn(d_def), container_value_fn(p_def),
+                          wt, empty_visitor{}, std::less<int>{}, std::plus<int>{},
+                          use_default_heap{}, std::allocator<std::byte>{});
+
+  dijkstra_shortest_paths(g, source,
+                          container_value_fn(d_idx), container_value_fn(p_idx),
+                          wt, empty_visitor{}, std::less<int>{}, std::plus<int>{},
+                          use_indexed_dary_heap<>{}, std::allocator<std::byte>{});
+
+  // Textbook distances from CLRS Figure 24.6.
+  CHECK(d_idx["s"] == 0);
+  CHECK(d_idx["t"] == 8);
+  CHECK(d_idx["x"] == 9);
+  CHECK(d_idx["y"] == 5);
+  CHECK(d_idx["z"] == 7);
+
+  // Indexed heap must agree with the default heap on every vertex.
+  for (auto&& [uid, u] : views::vertexlist(g)) {
+    CHECK(d_idx[uid] == d_def[uid]);
+  }
+  CHECK(p_idx["s"] == "s");
+}
diff --git a/tests/algorithms/test_indexed_dary_heap.cpp b/tests/algorithms/test_indexed_dary_heap.cpp
new file mode 100644
index 0000000..236148d
--- /dev/null
+++ b/tests/algorithms/test_indexed_dary_heap.cpp
@@ -0,0 +1,324 @@
+/**
+ * @file test_indexed_dary_heap.cpp
+ * @brief Catch2 tests for graph::detail::indexed_dary_heap.
+ *
+ * Coverage:
+ *   - Construction, empty, size
+ *   - push / pop ordering (ascending and descending input)
+ *   - decrease-key (single and repeated)
+ *   - contains / clear
+ *   - Both arity 2 and arity 4
+ *   - Custom comparator (max-heap via std::greater)
+ *   - Both position-map adapters: vector_position_map, assoc_position_map
+ *   - Random stress (1 000 keys + 500 decrease-key ops)
+ *   - push_or_decrease convenience
+ */
+
+#include <catch2/catch_test_macros.hpp>
+#include <graph/detail/indexed_dary_heap.hpp>
+#include <graph/detail/heap_position_map.hpp>
+
+#include <functional>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+using graph::detail::indexed_dary_heap;
+using graph::detail::vector_position_map;
+using graph::detail::assoc_position_map;
+
+namespace {
+
+// Helper: drain a heap into a vector of keys, preserving pop order.
+template <class Heap>
+std::vector<typename Heap::key_type> drain(Heap& h) {
+  std::vector<typename Heap::key_type> out;
+  while (!h.empty()) {
+    out.push_back(h.top());
+    h.pop();
+  }
+  return out;
+}
+
+// Build a min-heap with vector_position_map over [0, dist.size()).
+template <std::size_t Arity = 4, class Compare = std::less<double>>
+auto make_vec_heap(std::vector<double>&      dist,
+                   std::vector<std::size_t>& pos,
+                   Compare                   cmp = {}) {
+  pos.assign(dist.size(), vector_position_map::npos);
+  auto distfn = [&dist](unsigned k) -> const double& { return dist[k]; };
+  return indexed_dary_heap<unsigned, decltype(distfn), Compare,
+                           vector_position_map, Arity>(
+        distfn, cmp, vector_position_map{pos});
+}
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Basic construction / empty
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: empty after construction", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist;
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  CHECK(h.empty());
+  CHECK(h.size() == 0u);
+}
+
+// ---------------------------------------------------------------------------
+// push / pop ordering
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: pops in ascending distance order", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {5.0, 2.0, 7.0, 1.0, 4.0};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  for (unsigned k = 0; k < dist.size(); ++k) {
+    h.push(k);
+  }
+  REQUIRE(h.size() == 5u);
+
+  // Distances: 0→5, 1→2, 2→7, 3→1, 4→4  ⇒  expected key order: 3,1,4,0,2
+  CHECK(drain(h) == std::vector<unsigned>{3, 1, 4, 0, 2});
+}
+
+TEST_CASE("indexed_dary_heap: descending input still pops ascending", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  for (int k = 9; k >= 0; --k) {
+    h.push(static_cast<unsigned>(k));
+  }
+  CHECK(drain(h) == std::vector<unsigned>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
+}
+
+TEST_CASE("indexed_dary_heap: single element", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {42.0};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  h.push(0);
+  REQUIRE(h.size() == 1u);
+  CHECK(h.top() == 0u);
+  h.pop();
+  CHECK(h.empty());
+}
+
+// ---------------------------------------------------------------------------
+// decrease-key
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: decrease-key reorders top", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {10, 20, 30, 40};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  for (unsigned k = 0; k < 4; ++k) h.push(k);
+  CHECK(h.top() == 0u);
+
+  // Move key 3 to the front by lowering its distance.
+  dist[3] = 1.0;
+  h.decrease(3);
+  CHECK(h.top() == 3u);
+
+  h.pop();
+  CHECK(h.top() == 0u);
+}
+
+TEST_CASE("indexed_dary_heap: repeated decrease-key on same key", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {100, 100, 100, 100, 100};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  for (unsigned k = 0; k < 5; ++k) h.push(k);
+
+  for (double d : {50.0, 25.0, 10.0, 1.0}) {
+    dist[2] = d;
+    h.decrease(2);
+    CHECK(h.top() == 2u);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// contains / clear
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: contains tracks membership", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {1, 2, 3};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  CHECK_FALSE(h.contains(0));
+  CHECK_FALSE(h.contains(1));
+
+  h.push(0);
+  h.push(2);
+  CHECK(h.contains(0));
+  CHECK_FALSE(h.contains(1));
+  CHECK(h.contains(2));
+
+  h.pop(); // removes 0
+  CHECK_FALSE(h.contains(0));
+  CHECK(h.contains(2));
+}
+
+TEST_CASE("indexed_dary_heap: clear empties and resets positions", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {1, 2, 3, 4};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  for (unsigned k = 0; k < 4; ++k) h.push(k);
+  REQUIRE(h.size() == 4u);
+
+  h.clear();
+  CHECK(h.empty());
+  for (unsigned k = 0; k < 4; ++k) {
+    CHECK_FALSE(h.contains(k));
+    CHECK(pos[k] == vector_position_map::npos);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// push_or_decrease
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: push_or_decrease inserts then decreases",
+          "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {10, 20, 30};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap(dist, pos);
+
+  // First call inserts.
+  h.push_or_decrease(1);
+  CHECK(h.size() == 1u);
+  CHECK(h.top() == 1u);
+
+  h.push_or_decrease(2);
+  CHECK(h.size() == 2u);
+  CHECK(h.top() == 1u); // 20 < 30
+
+  // Lower key 2 below key 1 — second call should decrease, not duplicate.
+  dist[2] = 5.0;
+  h.push_or_decrease(2);
+  CHECK(h.size() == 2u);
+  CHECK(h.top() == 2u);
+}
+
+// ---------------------------------------------------------------------------
+// Arity 2
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: arity 2 produces sorted drain", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {5, 2, 7, 1, 4, 9, 3, 8, 6, 0};
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap<2>(dist, pos);
+
+  for (unsigned k = 0; k < dist.size(); ++k) h.push(k);
+  auto out = drain(h);
+
+  REQUIRE(out.size() == 10u);
+  for (std::size_t i = 1; i < out.size(); ++i) {
+    CHECK(dist[out[i - 1]] <= dist[out[i]]);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Custom comparator: max-heap via std::greater
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: std::greater yields max-heap", "[heap][indexed_dary_heap]") {
+  std::vector<double>      dist = {5, 2, 7, 1, 4};
+  std::vector<std::size_t> pos;
+  auto                     h    = make_vec_heap<4, std::greater<double>>(dist, pos);
+
+  for (unsigned k = 0; k < dist.size(); ++k) h.push(k);
+  // Distances: 0→5, 1→2, 2→7, 3→1, 4→4  ⇒ max-heap order: 2,0,4,1,3
+  CHECK(drain(h) == std::vector<unsigned>{2, 0, 4, 1, 3});
+}
+
+// ---------------------------------------------------------------------------
+// assoc_position_map (string keys)
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: assoc_position_map supports string keys",
+          "[heap][indexed_dary_heap][assoc_map]") {
+  std::unordered_map<std::string, double> dist = {
+        {"a", 5.0}, {"b", 2.0}, {"c", 7.0}, {"d", 1.0}};
+  std::unordered_map<std::string, std::size_t> pos;
+  auto distfn = [&dist](const std::string& k) -> const double& { return dist.at(k); };
+
+  using PMap = assoc_position_map<std::string>;
+  indexed_dary_heap<std::string, decltype(distfn), std::less<double>, PMap, 4> h(
+        distfn, std::less<double>{}, PMap{pos});
+
+  for (const auto& k : {"a", "b", "c", "d"}) h.push(k);
+  CHECK(drain(h) == std::vector<std::string>{"d", "b", "a", "c"});
+}
+
+TEST_CASE("indexed_dary_heap: assoc_position_map decrease-key", "[heap][indexed_dary_heap][assoc_map]") {
+  std::unordered_map<std::string, double> dist = {
+        {"x", 100.0}, {"y", 50.0}, {"z", 25.0}};
+  std::unordered_map<std::string, std::size_t> pos;
+  auto distfn = [&dist](const std::string& k) -> const double& { return dist.at(k); };
+
+  using PMap = assoc_position_map<std::string>;
+  indexed_dary_heap<std::string, decltype(distfn), std::less<double>, PMap, 4> h(
+        distfn, std::less<double>{}, PMap{pos});
+
+  h.push("x"); h.push("y"); h.push("z");
+  REQUIRE(h.top() == "z");
+
+  dist["x"] = 1.0;
+  h.decrease("x");
+  CHECK(h.top() == "x");
+  CHECK(h.contains("x"));
+  CHECK(h.contains("y"));
+  CHECK(h.contains("z"));
+
+  h.pop();
+  CHECK_FALSE(h.contains("x"));
+}
+
+// ---------------------------------------------------------------------------
+// Random stress: cross-check monotone drain after mixed decrease-key
+// ---------------------------------------------------------------------------
+
+TEST_CASE("indexed_dary_heap: random stress with decrease-key",
+          "[heap][indexed_dary_heap][stress]") {
+  constexpr unsigned N = 1000;
+  std::mt19937       rng(0xC0FFEE);
+
+  std::vector<double>                    dist(N);
+  std::uniform_real_distribution<double> dgen(0.0, 1000.0);
+  for (auto& d : dist) d = dgen(rng);
+
+  std::vector<std::size_t> pos;
+  auto                     h = make_vec_heap<4>(dist, pos);
+
+  for (unsigned k = 0; k < N; ++k) h.push(k);
+
+  // 500 random decrease-key ops.
+  std::uniform_int_distribution<unsigned> kpick(0, N - 1);
+  for (int i = 0; i < 500; ++i) {
+    const unsigned k = kpick(rng);
+    dist[k] *= 0.5;
+    h.decrease(k);
+  }
+
+  // Drain and assert monotone.
+  double   prev  = -1.0;
+  unsigned count = 0;
+  while (!h.empty()) {
+    const double cur = dist[h.top()];
+    CHECK(cur >= prev);
+    prev = cur;
+    h.pop();
+    ++count;
+  }
+  CHECK(count == N);
+}
diff --git a/tests/algorithms/test_mst.cpp b/tests/algorithms/test_mst.cpp
index 8589c02..004ac19 100644
--- a/tests/algorithms/test_mst.cpp
+++ b/tests/algorithms/test_mst.cpp
@@ -633,3 +633,59 @@ TEMPLATE_TEST_CASE("prim - sparse invalid seed throws",
                        container_value_fn(predecessor)),
                   std::out_of_range);
 }
+
+// =============================================================================
+// Prim's Algorithm — indexed d-ary heap parity (Phase 5)
+// =============================================================================
+
+TEST_CASE("prim - indexed d-ary heap parity", "[algorithm][mst][prim][indexed_heap]") {
+  using Graph = vov_weighted;
+  using id_t  = vertex_id_t<Graph>;
+
+  // 8-vertex weighted undirected graph that triggers post-finalization
+  // re-relaxation (the case that exposed the original Prim correctness bug).
+  // MST weight = 18 (verified by Kruskal cross-check below).
+  Graph g({{0, 1, 4}, {1, 0, 4}, {0, 2, 1}, {2, 0, 1}, {1, 2, 2}, {2, 1, 2},
+           {1, 3, 5}, {3, 1, 5}, {2, 3, 8}, {3, 2, 8}, {2, 4, 10},{4, 2, 10},
+           {3, 4, 2}, {4, 3, 2}, {3, 5, 6}, {5, 3, 6}, {4, 5, 3}, {5, 4, 3},
+           {4, 6, 9}, {6, 4, 9}, {5, 6, 7}, {6, 5, 7}, {5, 7, 1}, {7, 5, 1},
+           {6, 7, 4}, {7, 6, 4}});
+
+  const auto N = num_vertices(g);
+
+  auto run = [&](auto heap_tag) {
+    std::vector<id_t> predecessor(N);
+    std::vector<int>  weight(N);
+    init_shortest_paths(g, weight, predecessor);
+    auto total = prim(g, id_t{0},
+                      container_value_fn(weight),
+                      container_value_fn(predecessor),
+                      [](const auto& gr, const auto& uv) { return edge_value(gr, uv); },
+                      std::less<int>(),
+                      heap_tag);
+    return std::make_tuple(total, predecessor, weight);
+  };
+
+  auto [total_def,  pred_def,  wt_def ] = run(graph::use_default_heap{});
+  auto [total_idx4, pred_idx4, wt_idx4] = run(graph::use_indexed_dary_heap<4>{});
+  auto [total_idx8, pred_idx8, wt_idx8] = run(graph::use_indexed_dary_heap<8>{});
+
+  // Cross-check the absolute MST weight against Kruskal on the same edges.
+  using Edge = simple_edge<uint32_t, int>;
+  std::vector<Edge> edges = {
+      {0, 1, 4}, {0, 2, 1}, {1, 2, 2}, {1, 3, 5}, {2, 3, 8}, {2, 4, 10},
+      {3, 4, 2}, {3, 5, 6}, {4, 5, 3}, {4, 6, 9}, {5, 6, 7}, {5, 7, 1},
+      {6, 7, 4}};
+  std::vector<Edge> mst;
+  graph::kruskal(edges, mst);
+  const int kruskal_weight = total_weight(mst);
+
+  REQUIRE(kruskal_weight == 18);   // sanity
+  REQUIRE(total_def      == 18);   // Prim default heap matches Kruskal
+  REQUIRE(total_def      == total_idx4); // and matches indexed Idx4
+  REQUIRE(total_def      == total_idx8); // and matches indexed Idx8
+
+  // Per-vertex tree-edge weights must agree (predecessor may differ when ties).
+  REQUIRE(wt_def == wt_idx4);
+  REQUIRE(wt_def == wt_idx8);
+}