diff --git a/.gitignore b/.gitignore index 62dc5dc..29075b6 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,18 @@ CMakeUserPresets.json # Test outputs Testing/ *.log + +# Benchmark data (large graph files downloaded separately — see benchmark/data/README.md) +benchmark/data/*.txt +benchmark/data/*.gz + +# VTune profiler raw collections + CSV exports +build/vtune/ +vtune/ + +# Perf-tooling artifacts (bench JSON, disassembly captures, hotspot CSVs) +artifacts/ + +# Python bytecode caches +__pycache__/ +*.pyc diff --git a/CHANGELOG.md b/CHANGELOG.md index 1237f32..7b2acb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,9 @@ ## [Unreleased] ### Added +- **Indexed d-ary heap for Dijkstra** (`detail/indexed_dary_heap.hpp`) — opt-in O(V)-bounded heap with true decrease-key, parameterized by arity. Selected via the new `use_indexed_dary_heap` heap-selector tag on `dijkstra_shortest_paths` / `dijkstra_shortest_distances`. Supports both dense graphs (via `vector_position_map`) and mapped / hashable-vertex-id graphs (via `assoc_position_map`). +- `use_default_heap` and `use_indexed_dary_heap` heap-selector tags. **`use_default_heap` remains the default** — it wins on grid (E/V≈4) and path (E/V=1) workloads. Use `use_indexed_dary_heap<8>` for high-E/V random / scale-free graphs on `compressed_graph`, where Phase 4 benchmarks measured −25% (Erdős–Rényi) and −17% (Barabási–Albert) at 100K vertices vs. the default. See `agents/indexed_dary_heap_results.md` for full numbers. +- `vector_position_map` / `assoc_position_map` adapters (`detail/heap_position_map.hpp`) used by the indexed heap. - **Tarjan's SCC algorithm** (`tarjan_scc.hpp`) — single-pass O(V+E) strongly connected components using iterative DFS with low-link values; no transpose graph needed - 17 new Tarjan SCC tests (`test_tarjan_scc.cpp`) - **Mapped (sparse) graph algorithm support** — all 14 algorithms now accept `adjacency_list` (both index and map-based containers) diff --git a/CMakePresets.json b/CMakePresets.json index fd22838..fae0f9e 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -27,6 +27,10 @@ "lhs": "${hostSystemName}", "rhs": "Windows" }, + "architecture": { + "value": "x64", + "strategy": "external" + }, "cacheVariables": { "CMAKE_C_COMPILER": "cl.exe", "CMAKE_CXX_COMPILER": "cl.exe" @@ -81,18 +85,19 @@ { "name": "windows-msvc-release", "displayName": "Windows MSVC Release", - "description": "Windows development build with MSVC (Release)", + "description": "Windows development build with MSVC (Release; default flags /O2 /Ob2 /DNDEBUG)", "inherits": "windows-base", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release", "BUILD_TESTS": "ON", - "BUILD_EXAMPLES": "ON" + "BUILD_EXAMPLES": "ON", + "BUILD_BENCHMARKS": "ON" } }, { "name": "windows-msvc-relwithdebinfo", "displayName": "Windows MSVC RelWithDebInfo", - "description": "Windows optimized build with debug info", + "description": "Windows optimized build with debug info (default MSVC RelWithDebInfo flags: /O2 /Ob1 /Zi /DNDEBUG)", "inherits": "windows-base", "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo", @@ -100,6 +105,18 @@ "BUILD_EXAMPLES": "ON" } }, + { + "name": "windows-msvc-profile", + "displayName": "Windows MSVC Profile (release codegen + /Ob3 + PDB)", + "description": "Investigation-only build: /O2 /Ob3 /Zi /DNDEBUG + /DEBUG linker. Maximum inlining (/Ob3) so VTune sees the inlined hot path; PDB so symbols and source attribution work. Do NOT use for production timing comparisons \u2014 use windows-msvc-release for those.", + "inherits": "windows-msvc-release", + "cacheVariables": { + "CMAKE_CXX_FLAGS_RELEASE": "/O2 /Ob3 /Zi /DNDEBUG", + "CMAKE_EXE_LINKER_FLAGS_RELEASE": "/DEBUG", + "CMAKE_SHARED_LINKER_FLAGS_RELEASE": "/DEBUG", + "DIJKSTRA_BENCH_BGL": "ON" + } + }, { "name": "windows-clang-debug", "displayName": "Windows Clang Debug", diff --git a/agents/doc_revision_plan.md b/agents/archive/doc_revision_plan.md similarity index 100% rename from agents/doc_revision_plan.md rename to agents/archive/doc_revision_plan.md diff --git a/agents/index_vertex_descriptor_plan.md b/agents/archive/index_vertex_descriptor_plan.md similarity index 100% rename from agents/index_vertex_descriptor_plan.md rename to agents/archive/index_vertex_descriptor_plan.md diff --git a/agents/map_container_plan.md b/agents/archive/map_container_plan.md similarity index 100% rename from agents/map_container_plan.md rename to agents/archive/map_container_plan.md diff --git a/agents/map_container_strategy.md b/agents/archive/map_container_strategy.md similarity index 100% rename from agents/map_container_strategy.md rename to agents/archive/map_container_strategy.md diff --git a/agents/dary_heap/csr_edge_value_perf_plan.md b/agents/dary_heap/csr_edge_value_perf_plan.md new file mode 100644 index 0000000..89a0f4e --- /dev/null +++ b/agents/dary_heap/csr_edge_value_perf_plan.md @@ -0,0 +1,578 @@ +# `compressed_graph` edge-value access path — performance investigation plan + +## Background + +Phase 4.3a (see `indexed_dary_heap_results.md`) measured graph-v3 + +`use_indexed_dary_heap` against BGL `compressed_sparse_row_graph` + +`d_ary_heap_indirect<…, 4, …>` on identical 100 K-vertex graphs: + +| Topology | Idx4 vs BGL | Idx8 vs BGL | +|---|---:|---:| +| ER Sparse | +7.7% | +5.9% | +| BA | +9.4% | +4.6% | +| Grid | +36.5% | +38.5% | +| Path | +15.0% | +14.6% | + +Open Question 6 ruled out arity (Idx4/Idx8 within 1–3 pp) and the heap (gap is +*largest* on Grid, the topology with the most predictable heap-access pattern). +Open Question 3 confirmed `compare_(distance_(...))` inlines to a single +`ucomisd` against a direct base+idx*8 load. The remaining gap therefore lives +in the **relax loop's edge-value access path** — i.e. between +`for (auto&& [vid, uv] : views::incidence(g, u))` and the load of +`edge_value(g, uv)` inside the user's `WeightFn` lambda. + +This document plans an investigation with no implementation commitments yet. + +### Side observation — MSVC vs GCC, same machine (2026-04-26) + +`agents/indexed_dary_heap_baseline_msvc.md` captures the same `benchmark_dijkstra` +suite under MSVC 19.50 (Visual Studio 18.5.1, x64 Release) on the same Titania +host. Most CSR rows agree with the GCC numbers within ±10 %, with one striking +exception: + +| Topology @ 100K | Heap | GCC ns | MSVC ns | MSVC ÷ GCC | +|---|---|---:|---:|---:| +| Path | Default | 268,708 | 1,331,743 | **4.96×** | +| Path | Idx4 | 326,018 | 498,438 | 1.53× | +| Path | Idx8 | 327,820 | 491,302 | 1.50× | + +MSVC's `std::priority_queue` codegen is **~5×** slower than libstdc++'s on the +Path workload (no decrease-key, single-source linear chain). Switching to the +indexed heap collapses the toolchain gap to ~1.5×, so the slowdown is in +MSVC's heap implementation, not in graph-v3's CPO/visitor scaffolding. + +This is **not** the gap this plan is trying to close (we're chasing the +graph-v3 vs BGL CSR gap on a single toolchain), but it is worth flagging: +any VTune profile run under this plan must be compared against MSVC numbers +for the same toolchain — never cross-compared against the Linux/GCC baseline +in `indexed_dary_heap_baseline.md`. The MSVC baseline is the anchor for +Phase 1 (Windows) below. + +--- + +## What the access path actually does today + +For `compressed_graph`: + +**Storage** (`include/graph/container/compressed_graph.hpp`) +- `row_index_` : `vector` — one entry per vertex, `.index` is offset into `col_index_`. +- `col_index_` : `vector` — one entry per edge, `.index` is `vertex_id_t` of target. +- `edge_value_` : `vector` — one entry per edge, parallel to `col_index_`, stored in a *separate buffer*. + +**Per-edge work in the Dijkstra inner loop** (`dijkstra_shortest_paths.hpp:460`): +```cpp +for (auto&& [vid, uv] : views::incidence(g, u)) { + const auto w = weight_fn(g, uv); // → edge_value(g, uv) + // → g.edge_value(uv.value() - g.col_index_.begin()) + // → edge_value_[k] + ... + relax_target(uv, uid); // reads target_id(g, uv) = uv.value()->index → col_index_[k].index +} +``` + +So the inner loop touches two parallel arrays per edge: + +| Load | Source | Cache-line cost (typical) | +|---|---|---| +| `vid` (target id, u32) | `col_index_[k].index` | 1 line per 16 edges | +| `w` (weight, f64) | `edge_value_[k]` | 1 line per 8 edges | +| `distance[vid]` (f64) | distance buffer (random) | 1 line per visit (random access) | + +BGL's `compressed_sparse_row_graph` with a bundled property and +`get(&prop::weight, g)` typically resolves to a raw `Weight*` of length +`num_edges()`. The data layout is therefore comparable — *but* BGL's +adjacency arrays are also stored in distinct buffers. The dense-graph win +suggests the per-edge work, not the layout, is what differs. + +Other suspects to investigate, ordered by prior probability: + +1. **Iterator descriptor materialisation.** `views::incidence` yields a + structured `[vid, uv]` pair where `uv` is a full `edge_descriptor` + carrying the source vertex (for `target_id` symmetry), an iterator into + `col_index_`, possibly a graph back-reference. The relax loop only needs + `target_id` (already in `vid`) and `edge_value`; `uv` is also re-passed to + `weight_fn`, where `edge_value(g, uv)` recomputes + `uv.value() - g.col_index_.begin()` — a pointer subtraction the iterator + itself already has implicit (it *is* the iterator). +2. **Redundant pointer subtraction per edge.** `edge_value` resolves the + edge index via `uv.value() - g.col_index_.begin()`. In a tight loop this + is one extra subtraction per edge that BGL's `weight_map[edge_descriptor]` + may avoid (BGL's CSR `edge_descriptor` carries the index directly). +3. **`basic_incidence` not used.** `views::incidence` builds a full edge + descriptor; `views::basic_incidence` (also documented in + `views/incidence.hpp`) yields just `[tid]` and is documented as "lighter + still: never materialises an edge descriptor". The Dijkstra relax loop + could use a CSR-aware fast path that yields `(tid, edge_index)` and reads + `edge_value_[edge_index]` directly — but that breaks the visitor + contract (`on_examine_edge`, `on_edge_relaxed`, `on_edge_not_relaxed` + take `const edge_t&`). +4. **No prefetching.** BGL doesn't prefetch either, so this is not the gap + on its own — but if (1)/(2) are the cause, prefetching `edge_value_[k+P]` + at iterator increment is a free additional win. +5. **`auto&&` destructuring vs raw indexed loop.** `for (auto&& [vid, uv] : + views::incidence(g, u))` involves a range adapter, an iterator type, and + structured binding. GCC usually collapses this; verifying it does + (vs BGL's raw `csr_edge_iterator` over a `pair`) takes 5 minutes + with `objdump`. +6. **Cache-line alignment.** `col_index_` and `edge_value_` are + `std::vector`s; both start at 64-byte-aligned addresses by default + (`std::allocator`). Unlikely to be the issue but cheap to confirm. + +--- + +## Investigation phases + +### Phase 1 — Reproduce and quantify (no code changes) + +Goal: confirm the gap is reproducible at smaller `n`, isolate the loop. + +| Item | Detail | +|------|--------| +| **1.1 Re-run baseline** | `benchmark_dijkstra` Idx4 vs BGL CSR at n = 10K, 30K, 100K, 300K for ER, BA, Grid, Path (smaller sizes fit in L2; larger expose memory subsystem). 5 runs each, drop high/low, report median. | +| **1.2 Hardware counters** | `perf stat -e cycles,instructions,L1-dcache-load-misses,LLC-load-misses,branch-misses,branch-instructions ./benchmark_dijkstra --benchmark_filter=...` for `BM_Dijkstra_CSR_Grid_Idx4/100000` and `BM_Dijkstra_BGL_CSR_Grid/100000`. Tabulate IPC, cycles/edge, loads/edge, miss rates. | +| **1.3 perf record + annotate** | Single-run profile with `perf record -F 4000 --call-graph=lbr` of each, `perf annotate` on the inlined relax loop. Compare instruction mix and identify the cycle-eating instructions. | +| **1.4 Verdict** | If counters show graph-v3 has materially more **instructions/edge** with similar miss rates → suspect 1, 2, 5 (work, not memory). If similar instructions/edge but more **L1/LLC misses/edge** → suspect layout / prefetch. | + +Output: a small results table appended to `indexed_dary_heap_results.md` § +"Phase 4.3b — CSR access-path profiling". + +### Phase 2 — Disassembly comparison (no code changes) + +Goal: verify the inlining hypothesis from Open Q3 still holds for the +*entire* relax body, not just the heap, and quantify per-edge instruction +count. + +| Item | Detail | +|------|--------| +| **2.1 Locate relax loop** | `nm --demangle benchmark_dijkstra` → find the `dijkstra_shortest_paths<...CSR..., Idx4>::run::operator()<...>` constprop symbol. `objdump -d --no-show-raw-insn` over its byte range. Identify the `for (auto&&[vid, uv] : views::incidence(g, u))` body by structure (back-edge into the inner loop, distance load, ucomisd, conditional jump). | +| **2.2 Per-edge instruction count** | Count `mov`/`add`/`sub`/`cmp`/`j*` between the inner-loop top and back-edge. Repeat for BGL's compiled `csr_*` Dijkstra. Diff. | +| **2.3 Check for redundant work** | Specifically look for: (a) two separate `(%base,%idx,8)` loads from distinct base registers (= `col_index_[k].index` and `edge_value_[k]`); (b) `lea`/`sub` sequences computing `uv.value() - col_index_.begin()` per edge; (c) any `call` instructions (should be zero per Q3). | +| **2.4 Verdict** | Concrete count of "extra instructions per edge in graph-v3 vs BGL". | + +### Phase 3 — Microbenchmark the descriptor cost + +Goal: isolate whether the `edge_descriptor` materialisation in +`views::incidence` is the cost, vs the `edge_value_` load itself. + +| Item | Detail | +|------|--------| +| **3.1 Hand-rolled raw loop benchmark** | New benchmark `BM_Dijkstra_CSR_*_Raw` that bypasses `views::incidence` and reads `g.col_index_[k].index` and `g.edge_value_[k]` directly via the CSR row range, but otherwise uses the same `dijkstra_shortest_paths` heap+visitor scaffolding. (Probably needs a small Dijkstra variant in the benchmark file, not in the public algorithm.) | +| **3.2 Compare** | Raw vs Idx4 vs BGL. If Raw closes most of the gap → confirms (1)/(2)/(5). If Raw still trails BGL → the gap is in the heap+distance-buffer access, not the edge access. | +| **3.3 Verdict** | Quantifies how much the descriptor abstraction costs in % terms. | + +### Phase 4 — Decide on a fix + +Driven entirely by Phase 1–3 findings. Candidate interventions, *in order +of preference*: + +1. **`edge_value` overload that takes the edge offset directly.** If `uv` + already carries the offset (or could be made to), eliminate the + `uv.value() - col_index_.begin()` subtraction. Possibly a `compressed_graph`- + specific friend overload of `edge_value(g, uv)` that uses an internal + stored offset. Zero ABI impact on other graph types. +2. **`incidence` fast-path on `compressed_graph`.** A specialisation that + precomputes `(tid, edge_index)` per step and caches the offset, so + downstream `edge_value(g, uv)` is a single indexed load. Has to preserve + the `edge_t` exposed to visitors, so the descriptor is still + constructible on demand. +3. **Algorithm-internal raw path on `compressed_graph`.** A Dijkstra + `if constexpr` branch for `is_compressed_graph` that walks `row_index_` + / `col_index_` / `edge_value_` directly. Largest perf win, biggest + maintenance cost (a second algorithm body), and skips the visitor edge + events. Only justified if (1) and (2) are insufficient. +4. **Software prefetch.** `__builtin_prefetch(&col_index_[k+P])` and + `&edge_value_[k+P]` inside the per-edge loop. Free perf if the bottleneck + is memory not work; harmful if the bottleneck is work. +5. **Layout change** (interleave target id and weight in one struct of + size `sizeof(VId) + sizeof(EV)`). Big change for uncertain win — defer. + +Each candidate gets its own commit and benchmark delta, judged on the same +ER/BA/Grid/Path/100K table. + +### Phase 5 — Document and decide default + +| Item | Detail | +|------|--------| +| **5.1 Update results doc** | Final numbers go in `indexed_dary_heap_results.md` § "Phase 4.3b". | +| **5.2 Update plan doc** | Resolve Open Q6's "out of scope" caveat. | +| **5.3 Decide default heap** | If the fix shifts CSR + default-heap below CSR + Idx8 on dense graphs, revisit the Phase 4.2 default-heap recommendation. | + +--- + +## Acceptance criteria + +- A single-page §4.3b in `indexed_dary_heap_results.md` with the n=100K table + rerun after each intervention. +- A clear `perf stat` / `objdump` artifact identifying the *instruction-level* + cause of the gap (not just "the loop is slower"). +- A go/no-go decision on each of the 5 candidate interventions. +- All 4848 ctest tests still pass after any landed change. + +## Out of scope + +- Changing public algorithm signatures. +- Changing `compressed_graph`'s public storage layout (`row_index_`, + `col_index_`, `edge_value_` member access stays as-is for users with + external code that touches them). +- Heap changes (settled in Phase 4.3a). +- Prim — already inherits any Dijkstra perf win via the Phase 5 + Option 1 wrapper. + +## Risk + +- The investigation may show the gap is in `std::vector`-style + layout work that we *can't* close without a new container, in which case + the correct outcome is documenting the residual gap and stopping. +- Prefetch tuning is fragile and machine-specific; if it lands it must be + benchmarked on at least two different µarch generations before being + enabled by default. + +--- + +## Phase 1.1 — Reproduce on Windows MSVC (`windows-msvc-profile`, 2026-04-27) + +**Build:** `windows-msvc-profile` preset (`/O2 /Ob3 /Zi /DNDEBUG`, `/DEBUG` +linker, `DIJKSTRA_BENCH_BGL=ON`, BGL at `D:/dev_graph/boost`). +**Methodology:** core 0 pinning, priority `High`, 5 reps, median, 2 s min +benchmark time. Same machine (Titania) as the Linux baseline. + +### Results — graph-v3 Idx4 vs BGL CSR @ n = 100K + +| Topology | graph-v3 Idx4 (ns) | BGL CSR (ns) | graph-v3 vs BGL | +|------------|-------------------:|-------------:|----------------:| +| ER Sparse | 20,147,385 | 32,849,012 | **−38.7 %** ✅ | +| Grid | 7,203,305 | 10,927,450 | **−34.1 %** ✅ | +| BA | 20,446,214 | 32,326,378 | **−36.7 %** ✅ | +| Path | 394,793 | 1,101,341 | **−64.1 %** ✅ | + +CV ≤ 5 % on every row except Path/Idx4 (10.3 % — single noisy run; absolute +delta is well outside any plausible CV band). + +### Comparison with the original Phase 4.3a baseline (Linux GCC, 2025) + +| Topology | Phase 4.3a Idx4 vs BGL (Linux GCC) | Phase 1.1 Idx4 vs BGL (Windows MSVC, today) | +|------------|-----------------------------------:|--------------------------------------------:| +| ER Sparse | **+7.7 %** (graph-v3 slower) | **−38.7 %** (graph-v3 faster) | +| Grid | **+36.5 %** (graph-v3 slower) | **−34.1 %** (graph-v3 faster) | +| BA | **+9.4 %** (graph-v3 slower) | **−36.7 %** (graph-v3 faster) | +| Path | **+15.0 %** (graph-v3 slower) | **−64.1 %** (graph-v3 faster) | + +### Interpretation + +The motivating gap (graph-v3 7–37 % slower than BGL on Linux GCC) does **not +reproduce on Windows MSVC**: under MSVC `/O2 /Ob3` graph-v3 is 34–64 % +*faster* than BGL on every topology. Two non-exclusive explanations: + +1. **Toolchain-dependent codegen.** GCC may inline BGL's `get(weight, g)` + property-map machinery (heavy template specialization on tag dispatch) + more aggressively than MSVC, while MSVC at `/Ob3` collapses graph-v3's + `views::incidence` + `edge_value(g, uv)` chain — the exact path Phase + 4.3e proved is now fully inlined under MSVC profile flags. +2. **Code drift since 4.3a.** The `indexed-dary-heap` branch contains + significant work to the access path since 4.3a was captured: + - `5085c60` Edge desc (#23) + - `7645a19` Simplify traversal_common.hpp by unifying property function concepts (#22) + - `1c871a8` Phase 2: Add basic_incidence; refactor incidence uid overloads + - `aa95fe0` feat: add target_id to incidence_view return type + These specifically reduce the cost of the `views::incidence` + + `edge_value` chain that the perf plan identified as the suspect. + +### Decision + +The plan's premise (graph-v3 slower than BGL on CSR) is **not currently +reproducible on this toolchain**. Three branches of follow-up work, in +priority order: + +| Priority | Action | Rationale | +|----------|--------|-----------| +| **High** | Re-run Phase 4.3a / Phase 1.1 under Linux GCC on the same host | The original gap was a Linux-GCC-only phenomenon. Confirm whether the recent code drift (5085c60, 7645a19, 1c871a8, aa95fe0) closed it under GCC too. If yes → plan complete. If no → original investigation (Phases 1.2–4) still has work. | +| Medium | Cross-compare BGL itself: GCC vs MSVC on the same machine | If BGL gets dramatically faster under GCC than MSVC (and graph-v3 is roughly toolchain-neutral), the "gap" was always a BGL property-map advantage on GCC, not a graph-v3 deficit. | +| **Now** | Phase 2 disassembly on MSVC (next section) | Even though the gap is reversed, the plan's original Phase 2 instrumentation still tells us *why* graph-v3 wins on MSVC. Cheap with the profile preset's PDB. | + +--- + +## Phase 2 — MSVC disassembly of `sift_down_` and the relax loop (2026-04-27) + +**Tooling:** `scripts/perf/disasm_func.py` (new this session) targets a single +function by demangled-name substring instead of dumping the full 14k+ entries +of the exe. + +### VTune anchor on the profile preset + +``` +heap::sift_down_ 34.9 % +less::operator() 8.7 % (1st copy) +cfn::operator() 6.7 % +incidence_view::iterator::operator* 5.9 % +vector::operator[] 4.8 % +less::operator() 4.1 % (2nd copy) +dijkstra ... ::operator() 4.0 % +cfn::operator() 2.3 % (2nd copy) +heap::sift_up_ 1.7 % +``` + +Symbol attribution differs from Phase 4.3e (where 98.8 % collapsed into one +anonymous frame): `/Zi` keeps function boundaries visible to the linker even +when the bodies are inlined, so VTune can attribute samples to source-line +owners. The 98.8 % number was a **symbol-stripping artefact**, not an actual +codegen difference. Codegen at `/O2 /Ob3` and `/O2 /Ob3 /Zi` are the same; +only attribution differs. + +### `sift_down_` (Idx4) inner child-scan loop + +``` +artifacts/perf/sift_down_idx4.asm (Idx4, RVA 0x14006bbb0) + +LOOP_BODY (one comparison per child, 4 unrolled per outer step): + mov eax, [r11 + r8*4] ; load best-so-far child key + mov ecx, [r11 + r9*4] ; load other child key + movsd xmm0, [r10 + rax*8] ; load best distance + comisd xmm0, [r10 + rcx*8] ; compare against other distance + cmova r8, r9 ; if a < best → r8 := r9 +``` + +**Per-comparison cost: 5 instructions, 2 indexed loads, 1 `comisd`, 1 +conditional move.** No call instructions, no template scaffolding, no +pointer subtractions, no `std::less`/`container_value_fn` thunks visible in +the body — they have all been collapsed by `/Ob3`. This is the textbook +shape Open Question 3 hypothesised would happen; on MSVC it required +`/Ob3` to materialise (Phase 4.3d/e showed `/Ob2` was insufficient). + +The outer loop unrolls 4 children per iteration (Arity = 4) using the same +5-instruction template, then falls into a 1-child remainder loop +(`0x14006bcba`–`0x14006bcd8`, identical shape). Loop-carried dependencies +are limited to `r8` (best-index) and the loop counter `r9`. + +### What this tells us about the BGL "gap" + +The Phase 1.1 numbers showed graph-v3 −34 % to −64 % vs BGL on every +topology under MSVC. The disassembly confirms this is **real codegen**, not +a measurement artefact: + +- `sift_down_` is genuinely tight (5 insn / comparison, fully inlined + comparator). +- The relax-loop attribution (`incidence_view::iterator::operator*`, + `vector::operator[]`, the dijkstra lambda) totals ~15 % — a + reasonable fraction for the per-edge work. + +The remaining MSVC investigation work would be confirming that BGL's +`get(weight, g)` compiles to a comparable shape on MSVC (it likely *doesn't*, +which would explain the 35–65 % graph-v3 win). That is parked pending the +Linux GCC rerun (the only place the original gap lived). + +### Acceptance for Thread B (MSVC scope) + +- ✅ Phase 1.1 reruns the BGL comparison; the gap inverted to a graph-v3 + win of 34–64 %. +- ✅ Phase 2 disassembly proves the win is real codegen and identifies the + exact instruction shape. +- ⏸ Phase 3 (raw-loop microbenchmark) — **deferred**: there is no gap to + explain on MSVC, so a "what fraction of the gap is descriptor cost" + experiment has nothing to measure. +- ⏸ Phases 4–5 (interventions, default-heap revisit) — **deferred** until + Linux GCC reproduces or refutes the original gap. + +### Files captured this phase + +``` +artifacts/perf/hot_001.csv VTune CSV export (profile build) +artifacts/perf/sift_down_first.asm Idx2 sift_down_ (RVA 0x14006b9b0) +artifacts/perf/sift_down_idx4.asm Idx4 sift_down_ (RVA 0x14006bbb0) +``` + + + +--- + +## Phase 1.1 — Reproduce on Linux GCC (`linux-gcc-release`, 2026-04-28) + +Re-run on Linux/WSL with GCC under `linux-gcc-release` on the same +`indexed-dary-heap` HEAD as the MSVC capture, per +`agents/thread_b_linux_runbook.md`. Capture artifacts live under +`artifacts/perf/linux_gcc/` (gitignored, regenerable via +`bash scripts/perf/linux_gcc_capture.sh`). + +### Setup + +- Toolchain: `g++` 13.x, `-O3` via `linux-gcc-release` preset. +- BGL: `-DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/home/phil/dev_graph/boost`. +- Pinning: `taskset -c 4`, 5 reps, median, `--benchmark_min_time=2s`. +- Software perf events only (WSL has no PMU exposed; the script + attempts `task-clock,context-switches,...,instructions:u,cycles:u` but + those return non-zero on this host — captured for completeness, not + used for analysis). + +### graph-v3 vs BGL CSR (the central question) + +Indexed d-ary heap (Idx4, the per-arity comparison target): + +| Topology | N | graph-v3 CSR Idx4 (ns) | BGL CSR (ns) | Δ % vs BGL | CV v3 % | CV BGL % | +|------------|---------:|-----------------------:|----------------:|-----------:|--------:|---------:| +| ER_Sparse | 10,000 | 1,131,399 | 927,971 | **+21.9 ⚠** | 1.5 | 2.4 | +| ER_Sparse | 100,000 | 22,472,835 | 19,586,652 | **+14.7 ⚠** | 4.8 | 10.0 | +| Grid | 10,000 | 594,732 | 425,172 | **+39.9 ⚠** | 0.8 | 0.8 | +| Grid | 100,000 | 8,007,736 | 5,877,749 | **+36.2 ⚠** | 1.5 | 1.9 | +| BA | 10,000 | 1,099,949 | 925,914 | **+18.8 ⚠** | 2.5 | 0.4 | +| BA | 100,000 | 19,791,772 | 18,669,163 | +6.0 ⚠ | 1.4 | 0.7 | +| Path | 10,000 | 30,905 | 26,723 | **+15.6 ⚠** | 1.9 | 0.5 | +| Path | 100,000 | 313,622 | 272,206 | **+15.2 ⚠** | 2.3 | 0.4 | + +Default heap (binary, no Idx tag), for completeness: + +| Topology | N | graph-v3 CSR (ns) | BGL CSR (ns) | Δ % vs BGL | +|------------|---------:|------------------:|----------------:|-----------:| +| ER_Sparse | 10,000 | 1,225,762 | 927,971 | +32.1 ⚠ | +| ER_Sparse | 100,000 | 23,173,028 | 19,586,652 | +18.3 ⚠ | +| Grid | 10,000 | 478,675 | 425,172 | +12.6 ⚠ | +| Grid | 100,000 | 6,251,103 | 5,877,749 | +6.4 ⚠ | +| BA | 10,000 | 1,199,459 | 925,914 | +29.5 ⚠ | +| BA | 100,000 | 22,868,714 | 18,669,163 | +22.5 ⚠ | +| Path | 10,000 | 26,137 | 26,723 | −2.2 | +| Path | 100,000 | 261,003 | 272,206 | −4.1 | + +CVs are uniformly ≤ 5 % — these are real differences, not noise. + +### Comparison with the original Phase 4.3a baseline (Linux GCC, 2025) + +| Topology | Phase 4.3a Idx4 vs BGL (Linux GCC, 2025) | Phase 1.1 Linux Idx4 vs BGL (today) | +|------------|------------------------------------------|-------------------------------------| +| ER_Sparse | +7.7 % (slower) | **+14.7 % to +21.9 %** (slower) | +| Grid | +36.5 % (slower) | **+36.2 % to +39.9 %** (slower) | +| BA | +9.4 % | +6.0 % to +18.8 % | +| Path | +15.0 % | +15.2 % to +15.6 % | + +### Cross-toolchain (MSVC vs GCC) + +The same code, same machine class, two compilers — illustrative deltas +(median, `bench_compare.py --label-baseline msvc --label-candidate gcc`): + +| Benchmark | MSVC (ns) | GCC (ns) | Δ % | +|------------------------------------------|----------:|---------:|-------:| +| BM_Dijkstra_BGL_CSR_Grid/100000 | 10.49 M | 5.88 M | −44.0 ✅ | +| BM_Dijkstra_CSR_Grid_Idx4/100000 | 6.91 M | 8.01 M | +15.8 ⚠ | +| BM_Dijkstra_BGL_CSR_Path/100000 | 1.09 M | 0.27 M | −74.9 ✅ | +| BM_Dijkstra_CSR_Path_Idx4/100000 | 0.39 M | 0.31 M | −19.1 ✅ | +| BM_Dijkstra_BGL_CSR_ER_Sparse/100000 | 33.34 M | 19.59 M | −41.3 ✅ | +| BM_Dijkstra_CSR_ER_Sparse_Idx4/100000 | 19.98 M | 22.47 M | +12.5 ⚠ | + +Full table: `artifacts/perf/linux_gcc/diff_msvc_vs_gcc.md`. + +GCC compresses BGL's CSR Dijkstra body ~40-75 % vs MSVC. graph-v3 +gets *slower* on GCC for `Idx4` (+8 % to +16 %) — the opposite direction +— which is why the BGL gap that vanishes on MSVC re-emerges on GCC. + +### Decision-tree verdict (from `thread_b_linux_runbook.md`) + +> graph-v3 still +30 %+ slower on Grid (the original 4.3a worst case) → +> Original investigation premise is intact; proceed with edge-value +> access path investigation as planned. + +**Verdict: NO — the Phase 4.3a graph-v3-vs-BGL gap is *not* closed on +Linux GCC at HEAD.** The post-4.3a commits (`5085c60`, `7645a19`, +`1c871a8`, `aa95fe0`) closed the gap on MSVC (where graph-v3 now wins +−34 % to −64 %) but did not close it on GCC. Grid-100K is essentially +unchanged from the 2025 baseline (+36.5 % → +36.2 %). + +Phases 3–5 of this plan (raw-loop micro, descriptor-cost intervention, +default-heap revisit) are **un-deferred**. + +--- + +## Phase 2 — Linux GCC disassembly comparison (2026-04-28) + +`scripts/perf/linux_gcc_capture.sh` drives `objdump --demangle` over a +manifest at `agents/perf_capture_manifest_linux.txt`. Captures land in +`artifacts/perf/linux_gcc/*.asm`. + +### What GCC actually emits + +GCC inlines aggressively enough that several MSVC-side capture targets +have **no standalone body** at all: + +| Target | MSVC body? | GCC body? | +|------------------------------------------|-----------:|----------:| +| `sift_down_` (graph-v3, all arities) | yes (~185 lines) | **no — fully inlined into the dijkstra closure** | +| `sift_up_` (graph-v3) | yes (109 lines) | **no — fully inlined** | +| `preserve_heap_property_down` (BGL) | yes (299 lines) | **no — fully inlined** | +| `preserve_heap_property_up` (BGL) | yes (204 lines) | **no — fully inlined** | +| `container_value_fn::operator()` | yes (85 lines) | **no — inlined into the dijkstra closure** | + +GCC instead exposes the dijkstra body as an inner closure +`{lambda(auto:1&)#1}::operator()>`. That closure +is the apples-to-apples unit vs BGL's +`graph::benchmark::run_bgl_dijkstra<...>` (which itself absorbs all of +BGL's `dijkstra_shortest_paths_no_color_map_no_init`). + +### Per-symbol size comparison (objdump line counts) + +``` +symbol msvc-lines gcc-lines +bgl_dary_sift_down_csr 299 (inlined) +bgl_dary_sift_up_csr 204 (inlined) +container_value_fn 85 (inlined) +dijkstra_bgl_csr 505 412 +dijkstra_csr_idx2 206 361 +dijkstra_csr_idx4 206 387 +dijkstra_csr_idx8 206 382 +sift_down_csr_idx2 186 (inlined) +sift_down_csr_idx4 184 (inlined) +sift_down_csr_idx8 186 (inlined) +sift_down_vov_idx4 191 (inlined) +sift_up_csr_idx4 109 (inlined) +dijkstra_vov_idx4 NA 465 +dijkstra_bgl_adj NA 424 +``` + +Apples-to-apples (full inlined dijkstra body, line counts): + +| Body | MSVC (sum: dijkstra + sift_down + sift_up) | GCC (single body) | +|-----------------|-------------------------------------------:|------------------:| +| graph-v3 Idx4 | 206 + 184 + 109 = **499** | **387** | +| graph-v3 Idx2 | 206 + 186 + 109 = **501** | **361** | +| graph-v3 Idx8 | 206 + 186 + 109 = **501** | **382** | +| BGL CSR | 505 + 299 + 204 = **1,008** | **412** | + +### What this tells us about the GCC gap + +The size signal inverts on GCC: + +- On MSVC, graph-v3's *fully inlined* body is roughly half BGL's + (~500 vs ~1,000 lines) — matching the −34 % to −64 % wall-clock win. +- On GCC, BGL collapses to **412 lines** (a 2.4× reduction from MSVC), + while graph-v3 only collapses to ~380 (1.3× reduction). + The two bodies are now **comparably sized**, and graph-v3 is the one + that's slower (+15 % to +40 %) — same direction the size delta + predicts. + +In other words, GCC is much more aggressive at compressing BGL's +`get(weight_map, edge)` + `dijkstra_shortest_paths_no_color_map_no_init` +chain than MSVC is, while graph-v3's per-edge work +(`incidence_iterator::operator*` → `target_id` / `edge_value` → relax) +doesn't get the same treatment from GCC. + +This is exactly the codegen hypothesis Phase 4.3a articulated, and it +matches the Phase 3–5 intervention plan in this document: the next +investigation step is the **raw-loop microbenchmark** (Phase 3) to +measure how much of the +15 % to +40 % is attributable to the +descriptor / value-access path itself vs heap administration. + +### Files captured this phase + +``` +artifacts/perf/linux_gcc/wallclock_baseline.json 96 rows, 5 reps median +artifacts/perf/linux_gcc/diff_msvc_vs_gcc.md cross-toolchain table +artifacts/perf/linux_gcc/perfstat_*.{stdout,stderr} software events (PMU N/A on WSL) +artifacts/perf/linux_gcc/dijkstra_csr_idx{2,4,8}.asm graph-v3 inlined dijkstra body +artifacts/perf/linux_gcc/dijkstra_vov_idx4.asm graph-v3 VoV control +artifacts/perf/linux_gcc/dijkstra_bgl_csr.asm BGL CSR inlined dijkstra body +artifacts/perf/linux_gcc/dijkstra_bgl_adj.asm BGL adj_list inlined dijkstra body +``` + +### Acceptance for Thread B (Linux GCC scope) + +- ✅ Phase 1.1 Linux GCC reruns confirm the original 4.3a gap is intact. +- ✅ Phase 2 Linux GCC disassembly localises the gap to BGL's + ~2.4× more aggressive inlining vs graph-v3's ~1.3×. +- ▶ Phases 3–5 are **active again** — see + `agents/perf_linux_gcc_inventory.md` for the regeneration recipe + and per-symbol manifest details. diff --git a/agents/dary_heap/findings_summary.md b/agents/dary_heap/findings_summary.md new file mode 100644 index 0000000..b2dc76f --- /dev/null +++ b/agents/dary_heap/findings_summary.md @@ -0,0 +1,195 @@ +# Dijkstra `Heap` Template Parameter — Findings Summary + +**Status:** Final (closes Phase 4 of `indexed_dary_heap_plan.md`) +**Period:** 2026-04-25 → 2026-04-27 +**Branch:** `indexed-dary-heap` +**Scope:** Performance evaluation of `dijkstra_shortest_paths` after the +`Heap` template parameter was added, comparing graph-v3 against Boost.Graph +(BGL) on Linux (WSL2 / GCC) and Windows (MSVC), on identical hardware. + +This document is the consolidated reference for the heap-selector decision. +For the raw run logs, baselines, and per-phase analysis see: + +- [indexed_dary_heap_plan.md](indexed_dary_heap_plan.md) — phased work plan +- [indexed_dary_heap_baseline.md](indexed_dary_heap_baseline.md) — Linux/GCC Phase 0 baseline +- [indexed_dary_heap_baseline_msvc.md](indexed_dary_heap_baseline_msvc.md) — Windows/MSVC baseline + `/Ob3` +- [indexed_dary_heap_results.md](indexed_dary_heap_results.md) — Phase 4.1–4.3e detailed results +- [csr_edge_value_perf_plan.md](csr_edge_value_perf_plan.md) — follow-on CSR-access investigation + +--- + +## 1. What was added + +A new `Heap` template parameter on `dijkstra_shortest_paths` and +`dijkstra_shortest_distances`, selecting one of two heap implementations +via tag dispatch: + +| Tag | Implementation | +|-----|----------------| +| `use_default_heap` (default) | `std::priority_queue` with lazy deletion. Heap may grow to O(E); stale entries skipped at pop. | +| `use_indexed_dary_heap` | Indexed d-ary heap with true decrease-key. Heap size bounded by O(V); no stale pops. Position map auto-selected: `vector_position_map` for `index_vertex_range`, `assoc_position_map` (unordered_map) otherwise. | + +Both branches preserve identical visitor semantics +(`on_examine_vertex` / `on_finish_vertex` fire exactly once per reachable +vertex; `on_edge_relaxed` / `on_edge_not_relaxed` fire exactly once per +outgoing edge of every examined vertex). + +--- + +## 2. Test matrix + +Same machine (Titania, 20×3.61 GHz; 48 KiB L1-D, 1.28 MiB L2, 25 MiB L3) +under both toolchains. + +| Axis | Values | +|------|--------| +| OS / toolchain | Linux WSL2 + GCC (Phase 0/4.1), Windows + MSVC 19.50 (Phase 4.3b–e) | +| Container | `compressed_graph` (CSR), `dynamic_graph` (VoV) | +| Topology | Erdős–Rényi sparse (E/V ≈ 8), Barabási–Albert m=4 (E/V ≈ 8), 2D grid (E/V ≈ 4), path (E/V = 1) | +| Size | 1 K, 10 K, 100 K vertices | +| Heap | `Default`, `Idx2`, `Idx4`, `Idx8` | +| Reference | Boost.Graph `dijkstra_shortest_paths_no_color_map_no_init` on `compressed_sparse_row_graph` and on `adjacency_list` | + +Distance-vector parity vs BGL is asserted at startup +(`check_bgl_distance_parity` in `bgl_dijkstra_fixtures.hpp`) for ER, BA, and +Path at n = 1024. + +--- + +## 3. Headline performance results (CSR, n = 100 000) + +### Linux / GCC (Phase 4.1, mean of 3 runs) + +| Topology | E/V | Default | Idx2 | Idx4 | Idx8 | Best vs Default | +|----------|----:|--------:|-----:|-----:|-----:|:---------------:| +| ER Sparse | 8 | 27.0 ms | 24.2 ms | 25.8 ms | **20.2 ms** | **−25 %** | +| BA m=4 | 8 | 22.9 ms | 22.1 ms | 20.0 ms | **19.0 ms** | **−17 %** | +| Grid | 4 | **6.0 ms** | 6.7 ms | 8.2 ms | 8.4 ms | indexed +39 % regression | +| Path | 1 | **0.27 ms** | 0.33 ms | 0.33 ms | 0.33 ms | indexed +22 % regression | + +### Windows / MSVC (`/O2 /Ob2`, median of 5 reps) + +| Topology | E/V | Default | Idx2 | Idx4 | Idx8 | Best vs Default | +|----------|----:|--------:|-----:|-----:|-----:|:---------------:| +| ER Sparse | 8 | 26.7 ms | 26.4 ms | **21.1 ms** | 22.2 ms | **−21 %** | +| BA m=4 | 8 | 25.3 ms | 25.4 ms | **19.6 ms** | 22.2 ms | **−22 %** | +| Grid | 4 | **6.2 ms** | 6.9 ms | 6.9 ms | 8.2 ms | indexed +11 % regression | +| Path | 1 | 1.33 ms | 0.49 ms | 0.50 ms | **0.49 ms** | **−63 %** ✅ | + +### vs Boost.Graph on the same graphs (Linux/GCC, n = 100 K) + +| Topology | graph-v3 Default | graph-v3 Idx8 | BGL CSR | BGL `adjacency_list` | +|----------|-----------------:|---------------:|--------:|---------------------:| +| ER Sparse | 26.2 ms | **22.9 ms** | **19.9 ms** | 34.2 ms | +| BA m=4 | 26.9 ms | **21.7 ms** | **19.6 ms** | 30.9 ms | +| Grid | **6.2 ms** | 8.9 ms | **6.1 ms** | 9.9 ms | +| Path | **0.27 ms** | 0.33 ms | 0.28 ms | 0.52 ms | + +graph-v3 with the default heap **beats `boost::adjacency_list` on every +topology** (23–48 % faster). Against `boost::compressed_sparse_row_graph` +(BGL's optimised CSR with a native 4-ary indexed heap), `Idx8` closes most +of the gap on dense workloads (within ~5–10 %); the residual gap is in +graph-v3's edge-value access path on CSR, not in the heap (Phase 4.3a/b +verified — see [csr_edge_value_perf_plan.md](csr_edge_value_perf_plan.md)). + +--- + +## 4. Cross-topology summary + +| Topology | E/V | Best heap (GCC) | Best heap (MSVC) | Notes | +|----------|----:|:---------------:|:----------------:|-------| +| ER Sparse | 8 | **Idx8** | **Idx4** (Idx8 close) | Decrease-key wins on dense random graphs. | +| BA m=4 | 8 | **Idx8** | **Idx4** (Idx8 close) | Hub vertices drive heavy decrease-key traffic. | +| Grid | 4 | **Default** | **Default** | Position-map bookkeeping outweighs decrease-key benefit at low E/V. | +| Path | 1 | **Default** (slightly) | **Indexed (any)** — 2.7× faster | Under MSVC `std::priority_queue` codegen is materially slower than libstdc++ on this no-decrease-key workload. | + +Key takeaways: + +- **No single heap wins everywhere.** The trade-off is fundamentally + topology-dependent: indexed-heap bookkeeping is overhead on low-E/V + graphs, but pays off whenever decrease-key activity is meaningful. +- **Arity ordering on dense graphs:** Idx8 ≥ Idx4 > Idx2 on GCC; under + MSVC Idx4 was a touch better than Idx8 at n = 100 K, but the two are + within run-to-run noise on dense workloads. +- **`Arity = 4` matches Boost's `d_ary_heap_indirect`**, which is BGL's + hard-coded internal default. Choosing `Arity = 4` as graph-v3's default + preserves like-for-like comparability with BGL. +- **VoV gap is smaller** than CSR (Idx4 only −3 % on ER/BA): VoV's extra + indirection dilutes the heap's relative contribution. + +--- + +## 5. Decision: defaults + +| Aspect | Choice | Rationale | +|--------|--------|-----------| +| Public default heap | **`use_default_heap`** | Lowest overhead on low-E/V workloads (Grid, Path) under GCC. Wins or ties on 3 of 4 topologies on Linux. The single MSVC Path regression is fully recoverable by users opting into `use_indexed_dary_heap` for that workload. | +| Default arity for `use_indexed_dary_heap` | **`Arity = 4`** | Matches BGL's hard-coded arity, simplifying apples-to-apples comparison; on x86_64 Idx4 and Idx8 are within 1–5 pp on dense workloads — Idx8 wins narrowly under GCC, Idx4 wins narrowly under MSVC. `Arity = 4` is the safer default; users tuning for high E/V on x86_64 should explicitly choose `use_indexed_dary_heap<8>`. | + +Documented user guidance (mirrored in the algorithm header): + +> Use `use_indexed_dary_heap<8>` for dense (E/V ≳ 8) random or scale-free +> graphs on CSR. Keep `use_default_heap` for grid-like, path-like, or +> generally low-E/V workloads. The MSVC Path workload is an additional +> case where opting into the indexed heap is a clear win. + +A heuristic auto-selector based on E/V was considered and rejected: +computing E/V at call time adds overhead, and a compile-time E/V is not +available for the general `adjacency_list` concept. + +--- + +## 6. Toolchain-specific finding (MSVC) + +VTune software-mode hotspots on `Grid_Idx4/100K` (Phase 4.3b–e) showed that +MSVC `/O2 /Ob2` does **not** inline the indexed-heap internals — `sift_down_` +appeared as a real call frame consuming ~31 % of CPU time, and +`std::less::operator()` appeared as multiple distinct callable +symbols (~17 % combined). The same code under GCC `-O2/-O3` is fully +collapsed into a single inlined run-lambda. + +`__forceinline` on `sift_down_` / `sift_up_` / `less_than_` / `place_` +alone had no measurable effect at `/Ob2`. The combination +**`/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_`** does close the +inlining gap (98.8 % of CPU now in a single inlined frame), but the +wall-clock impact is mixed: + +| Topology (100 K) | `/Ob3` Δ vs `/Ob2` | +|------------------|-------------------:| +| ER Sparse Idx4 | −2.6 % | +| Grid Idx4 | +8.2 % regression | +| BA Idx4 | +6.3 % regression | +| Path Idx4 | **−7.6 %** | +| Path Default | +5.3 % | + +The `/Ob3` regressions on Grid and BA come from icache pressure: the +inlined `sift_down_` body expands the run-lambda enough to hurt +code-layout-sensitive workloads. + +**Outcome:** `/Ob3` is **not** committed as the default MSVC release +flag. The build presets remain at `/O2 /Ob2 /DNDEBUG`. The +`GRAPH_DETAIL_FORCE_INLINE` macro stays in `indexed_dary_heap.hpp` +(harmless under GCC where it is `[[gnu::always_inline]]`); the +`sift_down_` annotation was reverted because it provides no GCC benefit +and only matters under `/Ob3` on MSVC, which we do not enable. + +Users who care specifically about the MSVC Path case can build with +`/Ob3` themselves; the public defaults optimise for the common case. + +--- + +## 7. Open follow-ups (out of scope for the heap parameter) + +These are tracked in [csr_edge_value_perf_plan.md](csr_edge_value_perf_plan.md): + +- The remaining ~5–10 % gap to BGL CSR on dense workloads is in graph-v3's + edge-value access path (`edge_value(g, uv)` on `compressed_graph`), not + in the heap. Phase 4.3b confirmed the gap is work-bound (does not grow + with `n` across L2→L3 transition), ruling out memory-bound suspects. +- HW-counter profiling (`perf stat -e cycles,instructions,…`) is blocked + on WSL2 PMC support and is deferred. VTune `uarch-exploration` on + Windows is similarly deferred pending SEP driver / admin elevation. + +No further heap changes are planned. The `Heap` template parameter +shipped, with `use_default_heap` as the default and +`use_indexed_dary_heap<4>` as the documented opt-in for dense workloads. diff --git a/agents/dary_heap/indexed_dary_heap_baseline.md b/agents/dary_heap/indexed_dary_heap_baseline.md new file mode 100644 index 0000000..e2ebbf4 --- /dev/null +++ b/agents/dary_heap/indexed_dary_heap_baseline.md @@ -0,0 +1,65 @@ +# Dijkstra Baseline Benchmarks (Phase 0.4) + +Captured: 2026-04-25 +Branch: `indexed-dary-heap` (heap implementation: `std::priority_queue`, lazy-deletion) +Binary: `build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra` +Flags: `--benchmark_min_time=1s` + +## Machine + +| Property | Value | +|----------|-------| +| Host | Titania | +| CPUs | 20 × 3609.6 MHz | +| L1-D | 48 KiB × 10 | +| L2 | 1280 KiB × 10 | +| L3 | 25600 KiB × 1 | +| OS | Linux | + +## Results + +All times are wall-clock nanoseconds per Dijkstra call. +Construction and distance-vector reset are excluded from the timed region. + +### CSR (`compressed_graph`) — primary container + +| Benchmark | 1K ns | 10K ns | 100K ns | Complexity | +|-----------|------:|-------:|--------:|------------| +| ER Sparse (E/V≈8) | 61 016 | 1 362 706 | 29 086 330 | O(N log N) | +| Grid 2D (E/V≈4) | 24 991 | 525 412 | 6 706 910 | O(N log N) | +| Barabási–Albert m=4 (E/V≈8) | 58 157 | 1 338 566 | 25 402 054 | O(N log N) | +| Path (E/V=1) | 2 991 | 28 017 | 275 799 | O(N) | + +### VoV (`dynamic_graph`) — secondary container + +| Benchmark | 1K ns | 10K ns | 100K ns | Complexity | +|-----------|------:|-------:|--------:|------------| +| ER Sparse (E/V≈8) | 56 416 | 1 396 716 | 32 867 125 | O(N²) † | +| Grid 2D (E/V≈4) | 25 512 | 511 499 | 8 587 763 | O(N log N) | +| Barabási–Albert m=4 (E/V≈8) | 52 471 | 1 402 162 | 32 211 598 | O(N²) † | +| Path (E/V=1) | 4 599 | 43 635 | 440 975 | O(N) | + +† Google Benchmark fit `O(N²)` for VoV ER/BA at the measured scale; the + true complexity is O((V + E) log V) — this is likely a fitting artefact + from only three data points at high constant factors. Treat as O(N log N) + for interpretation purposes. + +## Key Observations + +| Observation | Detail | +|-------------|--------| +| CSR ≈ VoV at small scale (1K–10K) | Traversal cost is not yet dominant | +| CSR outperforms VoV at 100K | 29 ms vs 33 ms (ER), 6.7 ms vs 8.6 ms (Grid) | +| BA ≈ ER on CSR | Both E/V≈8, similar times as expected | +| Path is dramatically cheaper | 276 µs vs 29 ms at 100K — confirms lazy-deletion overhead with large heaps | +| CSR Path ≈ 2.76N | Sub-logarithmic: path graph → at most n pushes, heap stays tiny | +| VoV Path ≈ 4.41N | Consistent ~60% overhead vs CSR across all scales | + +## What to Beat in Phase 4 + +After the indexed d-ary heap is integrated, every CSR row should improve. +The path graph is the hardest to beat (heap barely fills) and the ER/BA +graphs are the easiest to win on (O(E) heap pops with lazy-deletion vs +O(V) with decrease-key). + +Target: CSR ER Sparse 100K ≤ **22 ms** (−25% vs 29 ms baseline). diff --git a/agents/dary_heap/indexed_dary_heap_baseline_msvc.md b/agents/dary_heap/indexed_dary_heap_baseline_msvc.md new file mode 100644 index 0000000..e788308 --- /dev/null +++ b/agents/dary_heap/indexed_dary_heap_baseline_msvc.md @@ -0,0 +1,232 @@ +# Dijkstra MSVC Baseline Benchmarks + +Captured: 2026-04-26 +Branch: `indexed-dary-heap` (HEAD `281fc7a`, working tree clean) +Binary: `build/windows-msvc-release/benchmark/algorithms/benchmark_dijkstra.exe` +Toolchain: MSVC 19.50.35729 (Visual Studio 18.5.1, host x64, target x64) +Build flags: default `windows-msvc-release` preset (`/O2 /Ob2 /DNDEBUG`) +Benchmark flags: `--benchmark_min_time=1s --benchmark_repetitions=5 +--benchmark_report_aggregates_only=true` +Process pinning: single physical core (affinity mask `0x1`), priority class +`High`. Run was split into four per-topology batches to stay inside the +session's command-tool timeout; each batch is independent. + +## Machine + +| Property | Value | +|----------|-------| +| Host | Titania (same as Linux baseline / Phase 4.x results) | +| CPUs | 20 × 3610 MHz | +| L1-D | 48 KiB × 10 | +| L1-I | 32 KiB × 10 | +| L2 | 1280 KiB × 10 | +| L3 | 25600 KiB × 1 | +| OS | Windows | + +The hardware exactly matches the Linux Phase 0 baseline and Phase 4.x +comparative runs, so any differences vs. those numbers reflect the +toolchain (MSVC vs. GCC) and the C++ standard library (MSVC STL vs. +libstdc++), not the machine. + +## Methodology notes (vs. the Linux baseline) + +| Concern | Linux baseline | MSVC baseline (this file) | +|---------|----------------|---------------------------| +| Frequency scaling | `cpupower frequency-set -g performance` | Process priority `High`; no governor knob on Windows. CV reported per row. | +| Single-core pin | `taskset -c 4` | `Process.ProcessorAffinity = 0x1` | +| min_time | `1s` (Phase 0 baseline) / `2s` (Phase 4.3a) | `1s` | +| Repetitions | 1 (Phase 0) / 3 (Phase 4.x averages) | 5 (median + CV reported) | +| Aggregation | mean of repetitions | median of 5 repetitions | + +## Heap variants compared + +| Tag | Description | +|-----|-------------| +| **Default** | `use_default_heap` — `std::priority_queue`, lazy deletion | +| **Idx2** | `use_indexed_dary_heap<2>` — binary heap, true decrease-key | +| **Idx4** | `use_indexed_dary_heap<4>` — 4-ary heap, true decrease-key | +| **Idx8** | `use_indexed_dary_heap<8>` — 8-ary heap, true decrease-key | + +--- + +## Results — CSR (`compressed_graph`) + +All times are wall-clock (real_time) nanoseconds per Dijkstra call, +**median of 5 repetitions**. CV is the coefficient of variation (real_time) +over those 5 repetitions; rows where CV exceeds 5 % are flagged with `†`. + +### Erdős–Rényi sparse, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | CV at 100K | +|------|------:|-------:|--------:|-----------:| +| Default | 69,406 | 1,288,481 | 26,655,249 | 1.58 % | +| Idx2 | 47,768 | 1,171,749 | 26,422,283 | 4.50 % | +| Idx4 | 54,294 | 1,063,524 | 21,124,883 | 4.63 % | +| Idx8 | 70,041 | 1,290,562 | 22,223,590 | 0.49 % | + +### 2D Grid, E/V ≈ 4 + +| Heap | 1K ns | 10K ns | 100K ns | CV at 100K | +|------|------:|-------:|--------:|-----------:| +| Default | 24,939 | 510,742 | 6,190,532 | 1.16 % | +| Idx2 | 21,970 | 536,500 | 6,927,086 | 1.15 % | +| Idx4 | 27,455 | 544,960 | 6,873,101 | 0.52 % | +| Idx8 | 32,708 | 673,041 | 8,246,247 | 1.26 % | + +### Barabási–Albert, m=4, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | CV at 100K | +|------|------:|-------:|--------:|-----------:| +| Default | 70,715 | 1,279,933 | 25,268,386 | 0.82 % | +| Idx2 | 43,928 | 1,133,997 | 25,353,973 | 1.65 % | +| Idx4 | 50,625 | 1,024,064 | 19,603,770 | 0.80 % | +| Idx8 | 67,539 | 1,244,579 | 22,194,943 | 8.92 % † | + +### Path, E/V = 1 + +| Heap | 1K ns | 10K ns | 100K ns | CV at 100K | +|------|------:|-------:|--------:|-----------:| +| Default | 13,241 | 132,771 | 1,331,743 | 0.27 % | +| Idx2 | 4,568 | 43,392 | 485,695 | 1.80 % | +| Idx4 | 4,510 | 43,019 | 498,438 | 1.77 % | +| Idx8 | 4,446 | 42,962 | 491,302 | 0.66 % | + +`†` BA Idx8 at 100K had a single high-variance run (one outlier of the +five repetitions). The mean and median are within 4 % of each other so the +median number is robust; treat the absolute level as ±10 % until rerun. + +--- + +## MSVC vs. Linux baseline — same machine, same hardware + +CSR 100K, median (MSVC) vs. Phase 0 baseline / Phase 4.1 results (Linux GCC), +both wall-clock ns per call: + +| Topology | Heap | MSVC ns | GCC ns (ref) | MSVC ÷ GCC | Source for GCC | +|----------|------|--------:|-------------:|-----------:|----------------| +| ER Sparse | Default | 26,655,249 | 27,049,885 | 0.99× | results §4.1 ER | +| ER Sparse | Idx4 | 21,124,883 | 25,756,981 | 0.82× | results §4.1 ER | +| ER Sparse | Idx8 | 22,223,590 | 20,216,860 | 1.10× | results §4.1 ER | +| Grid | Default | 6,190,532 | 6,026,301 | 1.03× | results §4.1 Grid | +| Grid | Idx4 | 6,873,101 | 8,165,088 | 0.84× | results §4.1 Grid | +| Grid | Idx8 | 8,246,247 | 8,400,126 | 0.98× | results §4.1 Grid | +| BA | Default | 25,268,386 | 22,904,717 | 1.10× | results §4.1 BA | +| BA | Idx4 | 19,603,770 | 19,998,964 | 0.98× | results §4.1 BA | +| BA | Idx8 | 22,194,943 | 19,038,871 | 1.17× † | results §4.1 BA | +| Path | Default | 1,331,743 | 268,708 | 4.96× ‼ | results §4.1 Path | +| Path | Idx4 | 498,438 | 326,018 | 1.53× | results §4.1 Path | +| Path | Idx8 | 491,302 | 327,820 | 1.50× | results §4.1 Path | + +`†` BA Idx8 100K MSVC has CV 8.9 %; the 1.17× ratio may shift on rerun. +`‼` Path/Default shows the **largest divergence** between toolchains. With +the indexed heap the ratio drops to ~1.5×, suggesting MSVC's +`std::priority_queue` codegen is materially slower than libstdc++'s on this +no-decrease-key workload — the new heap path is much closer to GCC parity. + +### Cross-topology relative ordering — does the Phase 4.x story hold under MSVC? + +| Topology | Best heap (Linux GCC, Phase 4.1) | Best heap (MSVC, this run) | Same? | +|----------|----------------------------------|----------------------------|-------| +| ER Sparse | Idx8 (−25 %) | **Idx4** (−21 % vs Default), Idx8 close (−17 %) | Idx4 / Idx8 swap; both clearly beat Default | +| Grid | Default | **Default** (Idx2/Idx4 within +11 %; Idx8 +33 %) | ✅ | +| BA | Idx8 (−17 %) | **Idx4** (−22 % vs Default), Idx8 noisy | Mostly ✅ — both indexed variants win | +| Path | Default | **Default** for absolute time, indexed wins by 2.7× — see below | ⚠ swap | + +The Path case is now the **opposite** of the Linux story: under MSVC the +indexed heap is **2.7× faster** than the default at 100K (491k ns vs +1.33M ns), whereas under GCC the default was 22 % faster than the indexed +heap. This is the headline MSVC-specific finding. + +## Recommendation update + +The Phase 4.2 default decision (`use_default_heap` is the public default) +holds under MSVC: + +- It still wins or ties on Grid. +- It loses badly on Path under MSVC (where it lost slightly to indexed under + GCC), but Path is a degenerate case (no decrease-key opportunity); the + indexed-heap recommendation already covers it. +- On dense / scale-free workloads (ER, BA), Idx4 is now slightly better than + Idx8 under MSVC at n = 100 K — opposite of GCC. The Phase 4.2 + recommendation of `use_indexed_dary_heap<8>` should be **softened to + "Idx4 or Idx8"** for the MSVC documentation, with a note that the + toolchain affects the optimum. + +These observations are **not strong enough to change any defaults or +recommendations** — they are baseline numbers for the next phase +(Phase 4.3b on Windows: VTune Microarchitecture Exploration of the relax +loop). Their purpose is to anchor every later VTune number to a known-good +point so we can tell "this VTune sample reflects a representative run". + +--- + +## `/Ob3` results — Phase 4.3e (2026-04-27) + +Build: `windows-msvc-release` with `CMAKE_CXX_FLAGS_RELEASE=/O2 /Ob3 /DNDEBUG` +`indexed_dary_heap.hpp`: `sift_down_` annotated `GRAPH_DETAIL_FORCE_INLINE` +Same methodology: 5 reps, median, core 0, priority High. + +### ER Sparse, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K | +|------|------:|-------:|--------:|----------------:| +| Default | 77,657 | 1,336,662 | 26,533,738 | ≈0 % | +| Idx2 | 54,785 | 1,171,990 | 26,154,649 | −1.0 % | +| Idx4 | 50,190 | 1,020,621 | 20,572,087 | **−2.6 %** | +| Idx8 | 80,134 | 1,248,033 | 23,112,681 | +4.0 % | + +### 2D Grid, E/V ≈ 4 + +| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K | +|------|------:|-------:|--------:|----------------:| +| Default | 25,203 | 537,223 | 6,323,796 | +2.2 % | +| Idx2 | 24,694 | 579,034 | 7,490,114 | +8.1 % | +| Idx4 | 28,244 | 606,708 | 7,440,434 | +8.2 % | +| Idx8 | 35,770 | 723,495 | 8,859,656 | +7.4 % | + +### Barabási–Albert, m=4, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K | +|------|------:|-------:|--------:|----------------:| +| Default | 91,214 | 1,422,637 | 27,633,036 | +9.3 % | +| Idx2 | 60,420 | 1,209,414 | 26,769,593 | +5.6 % | +| Idx4 | 54,289 | 1,068,178 | 20,839,074 | +6.3 % | +| Idx8 | 87,263 | 1,348,705 | 23,320,973 | +5.1 % | + +### Path, E/V = 1 + +| Heap | 1K ns | 10K ns | 100K ns | vs /Ob2 at 100K | +|------|------:|-------:|--------:|----------------:| +| Default | 14,059 | 138,226 | 1,401,957 | +5.3 % | +| Idx2 | 4,555 | 44,408 | 463,958 | **−4.5 %** | +| Idx4 | 4,829 | 44,297 | 460,474 | **−7.6 %** | +| Idx8 | 4,700 | 44,029 | 461,246 | **−6.1 %** | + +### Summary: /Ob3 vs /Ob2 at 100K + +| Topology | Heap | /Ob2 ns | /Ob3 ns | Δ | +|----------|------|--------:|--------:|---| +| ER Sparse | Default | 26,655,249 | 26,533,738 | ≈0 % | +| ER Sparse | Idx4 | 21,124,883 | 20,572,087 | −2.6 % | +| Grid | Default | 6,190,532 | 6,323,796 | +2.2 % | +| Grid | Idx4 | 6,873,101 | 7,440,434 | +8.2 % ⚠ | +| BA | Default | 25,268,386 | 27,633,036 | +9.3 % ⚠ | +| BA | Idx4 | 19,603,770 | 20,839,074 | +6.3 % ⚠ | +| Path | Default | 1,331,743 | 1,401,957 | +5.3 % | +| Path | Idx4 | 498,438 | 460,474 | **−7.6 %** ✅ | + +### Interpretation + +- **Path indexed heap wins** (−4.5 % to −7.6 %): this is the workload where + the VTune profile showed the most comparator-chain overhead — `/Ob3` + collapses it and delivers a measurable wall-clock improvement. +- **Grid and BA show regressions (+6–9 %)**: the inlined `sift_down_` body + expands the run-lambda significantly on these topologies (larger working + set, more icache pressure, different branch predictor behaviour). The + `/Ob2` code-layout was better for these cases. +- **ER Sparse is essentially neutral** (within noise). +- **Net verdict**: `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` is + **not a universal win**. It helps Path (the inlining-bottlenecked case) + but regresses Grid/BA (icache-sensitive cases). Reverting `sift_down_` + annotation and keeping `/Ob3` only for the flag-level benefit (without + force-inline on the loop body) is the next thing to try. diff --git a/agents/dary_heap/indexed_dary_heap_plan.md b/agents/dary_heap/indexed_dary_heap_plan.md new file mode 100644 index 0000000..094980e --- /dev/null +++ b/agents/dary_heap/indexed_dary_heap_plan.md @@ -0,0 +1,601 @@ +# Indexed d-ary Heap for Dijkstra & Prim — Plan + +This plan introduces a true decrease-key priority queue to replace the +`std::priority_queue` lazy-deletion pattern currently used by Dijkstra +(and likely useful for Prim's MST). The goal is to remove stale-pop +overhead, reduce heap memory from O(E) to O(V), and bring visitor +semantics in line with BGL. + +**Branch:** `indexed-dary-heap` + +**Invariant:** After every phase, `ctest` passes all existing tests. No +phase may break the public API of `dijkstra_shortest_paths`, +`dijkstra_shortest_distances`, or any algorithm that already uses +`std::priority_queue` internally. + +--- + +## Conventions + +| Symbol | Meaning | +|--------|---------| +| **File** | Absolute path relative to repo root | +| **Read** | Files the agent must read for context before editing | +| **Create** | New files to create | +| **Modify** | Existing files to edit | +| **Verify** | Commands to run and expected outcomes | +| **Commit** | Git commit message (conventional-commit style) | + +--- + +## Background + +### Current state + +`dijkstra_shortest_paths` uses `std::priority_queue` +with re-insertion when a vertex's distance improves. The recently +added stale-pop skip: + +```cpp +if (compare(distance(g, uid), w)) continue; +``` + +makes this correct and gives single-shot visitor semantics, but the +heap can hold up to O(E) entries and every relaxed edge causes a +push. + +### Target state + +A min-heap that: + +- Stores at most one entry per vertex (size ≤ V). +- Supports `push`, `top`, `pop`, `decrease(vid)`, `contains(vid)`. +- Looks up a vertex's current distance via the user-supplied + `DistanceFn` (so heap order tracks live distance). +- Is parameterized on arity `d` (default `d = 4`, matching Boost's + `d_ary_heap_indirect`). +- Uses an external position map (`vertex_id -> heap_index`) so that + `decrease` is O(log_d V). + +### Performance hypothesis + +| Workload | Expected change vs. current | +|----------|-----------------------------| +| Sparse graph, few re-relaxations | Small win (push count drops, log V vs log E) | +| Dense graph, many re-relaxations | Large win (heap size O(V) vs O(E)) | +| Mapped (associative) vertex containers | Win depends on position-map cost | + +Hypothesis must be confirmed by benchmarks (Phase 4) before declaring +the new heap the default. + +--- + +## Phase 0 — Preparation (no code changes) + +### 0.1 Verify Baseline + +| Item | Detail | +|------|--------| +| **Action** | Confirm the full test suite is green on the branch base. | +| **Verify** | `cd build/linux-gcc-debug && ctest --output-on-failure` — all tests pass | + +### 0.2 Benchmark Fixtures + +The fixtures must isolate three orthogonal axes: **scale**, **topology**, +and **weight distribution**. Decrease-key matters most where many edges +trigger relaxation, so topology selection is more important than raw size. + +#### Synthetic generators (primary — cheap, reproducible, deterministic with seed) + +| Generator | Purpose | Why it matters for d-ary heap | +|-----------|---------|-------------------------------| +| **Erdős–Rényi G(n, p)** | Random sparse/dense baseline | Tunable E/V ratio; "average case" | +| **2D grid** | Spatial structure | Many short paths converge → moderate re-relaxation | +| **Barabási–Albert (power-law)** | Hub-heavy | High-degree hubs cause heavy decrease-key traffic — where indexed heap should win biggest | +| **Watts–Strogatz small-world** | Realistic mixed | Local + long-range edges; intermediate case | +| **Path / cycle / complete** | Edge cases | Sanity bounds (no decreases vs. all decreases) | + +**Scale sweep:** V ∈ {10³, 10⁴, 10⁵, 10⁶} (and 10⁷ where memory permits). +**Density sweep:** E/V ∈ {2, 8, 32} for Erdős–Rényi. +**Weight distributions:** uniform random (default), exponential (drives more +decrease-key calls due to heavy left tail), constant 1 (BFS-equivalent floor). + +#### Graph container types + +Use **`compressed_graph`** as the primary benchmark container. Its CSR +layout (contiguous edge storage, no per-vertex indirection) minimizes +graph-traversal overhead so that heap operation cost is a larger fraction +of total runtime — making differences between heap implementations easier +to measure. Dijkstra never modifies the graph, so the read-only restriction +is not a concern. + +Also include **`dynamic_graph` (vov-backed)** in a secondary sweep to +confirm that wins on `compressed_graph` hold under a realistic dynamic +container and to serve as a regression baseline for typical user code. + +Do not benchmark mapped containers here — that is covered in Phase 3. + +#### Real-world graphs (validation — pick 2 to confirm synthetic conclusions) + +| Source | Suggested graph | Why | +|--------|-----------------|-----| +| **SNAP** | `roadNet-CA` (1.9M V, 5.5M E) | Classic Dijkstra benchmark; spatial / planar | +| **SNAP** | `web-Google` (875K V, 5M E) | Web-link topology; mixed degree distribution | + +DIMACS USA road network (24M V) and Graph500 RMAT are deferred — only +worth the infrastructure if Phase 4 results are ambiguous. + +#### Benchmark protocol + +| Concern | Approach | +|---------|----------| +| Generation cost | Build the graph in `SetUp()` / `state.PauseTiming()`, run Dijkstra in the timed region | +| Source variance | Run from N random sources per graph (e.g., N=8) and average | +| Metric beyond time | Report **relaxation count** alongside time — distinguishes heap-implementation wins from visitor-semantics bugs | +| CPU stability | Disable frequency scaling (`cpupower frequency-set -g performance`) to meet CV < 5% target | +| Caching of fixtures | Real-world graphs cached under `benchmark/data/` (gitignored); document fetch URLs in a README | + +| Item | Detail | +|------|--------| +| **Create** | `benchmark/algorithms/dijkstra_fixtures.hpp` (generators + loaders) | +| **Create** | `benchmark/data/README.md` (fetch instructions for SNAP graphs) | +| **Verify** | Each fixture produces a deterministic graph for a given seed; relaxation counts are stable across runs. | + +### 0.3 Add Dijkstra Benchmark (if missing) + +| Item | Detail | +|------|--------| +| **Action** | Ensure a Google Benchmark target exercises Dijkstra over the fixtures defined in 0.2. | +| **Create** | `benchmark/algorithms/benchmark_dijkstra.cpp` if not present | +| **Verify** | Benchmark builds and produces stable numbers across runs (CV < 5%). | + +### 0.4 Capture Baseline Benchmarks + +| Item | Detail | +|------|--------| +| **Action** | Record current Dijkstra benchmark numbers before any heap changes. | +| **Read** | `benchmark/algorithms/benchmark_dijkstra.cpp` | +| **Verify** | Save numbers to `agents/indexed_dary_heap_baseline.md` (committed as reference). | + +--- + +## Phase 1 — Indexed d-ary Heap Container + +### 1.1 Design Header + +| Item | Detail | +|------|--------| +| **Read** | `boost/libs/graph/include/boost/graph/detail/d_ary_heap.hpp` for reference | +| **Create** | `include/graph/detail/indexed_dary_heap.hpp` | + +Sketch of the public interface: + +```cpp +namespace graph::detail { + +// External-key, indirect-comparison d-ary heap. +// +// Key : the user's vertex id type (must be usable as an index/lookup key) +// DistanceFn: callable (key) -> Distance& (or const Distance&) +// Compare : strict weak order over Distance values (min-heap if less<>) +// PositionMap: random-access mapping key -> size_t (heap position) or NPOS +// Arity : children per node (default 4) +template < + class Key, + class DistanceFn, + class Compare, + class PositionMap, + std::size_t Arity = 4, + class Allocator = std::allocator> +class indexed_dary_heap { +public: + static constexpr std::size_t npos = static_cast(-1); + + indexed_dary_heap(DistanceFn d, Compare c, PositionMap p, const Allocator& = {}); + + bool empty() const noexcept; + size_t size() const noexcept; + + void push(Key k); // O(log_d N) + Key top() const; // O(1) + void pop(); // O(d log_d N) + void decrease(Key k); // O(log_d N) — distance must already be lower + bool contains(Key k) const; // O(1) + void clear(); + +private: + std::vector heap_; + DistanceFn distance_; + Compare compare_; + PositionMap position_; // heap stores positions back into here on every move + + void sift_up_(size_t i); + void sift_down_(size_t i); + void place_(size_t i, Key k); // writes heap_[i] = k AND position_[k] = i +}; + +} // namespace graph::detail +``` + +Notes: +- `PositionMap` is a *concept-style* requirement: `size_t& operator()(Key)` or + similar. For index-based graphs, it can wrap a `std::vector`. For + mapped graphs, it can wrap an `std::unordered_map`. Decision deferred to + 1.3. +- `DistanceFn` is the *same* function the user passes to + `dijkstra_shortest_paths`. The heap reads, never writes. +- Comparator is `Compare`, applied to *distances* (not keys). Internally: + `compare_(distance_(a), distance_(b))`. + +### 1.2 Implement Core Operations + +| Item | Detail | +|------|--------| +| **Action** | Implement `push`, `pop`, `sift_up_`, `sift_down_`, `decrease`, `contains`, `clear`. Keep `place_` as the single point where positions are written, to avoid bookkeeping bugs. | +| **Verify** | Unit-tests in 1.4 pass. | + +Key correctness rules: +- Every assignment to `heap_[i]` must go through `place_` so `position_` stays in sync. +- `decrease(k)` reads `position_(k)` then sifts up only — caller guarantees the new distance is no worse. +- `pop()` swaps last → root, marks the popped key's position as `npos`, then sifts down. + +### 1.3 Position Map Adapter + +| Item | Detail | +|------|--------| +| **Create** | `include/graph/detail/heap_position_map.hpp` | +| **Action** | Provide two adapters:
1. `vector_position_map` — wraps a `std::vector` indexed by integral key.
2. `assoc_position_map` — wraps `std::unordered_map` for non-integral keys.
Both default-construct to `npos` semantics. | +| **Verify** | Adapters compile with the heap. Covered by tests in 1.4. | + +### 1.4 Unit Tests + +| Item | Detail | +|------|--------| +| **Create** | `tests/common/test_indexed_dary_heap.cpp` | +| **Action** | Cover: empty heap, single element, ascending/descending pushes, mixed push+pop, repeated `decrease`, `contains` before/after push/pop, both arity 2 and 4, custom comparator (max-heap), both position-map adapters. | +| **Verify** | `ctest -R indexed_dary_heap` — all pass. | +| **Commit** | `feat(detail): indexed d-ary heap with external position map` | + +--- + +## Phase 2 — Integrate into Dijkstra (opt-in) + +### 2.1 Add Heap-Selector Tag (or Template Parameter) + +| Item | Detail | +|------|--------| +| **Read** | `include/graph/algorithm/dijkstra_shortest_paths.hpp` | +| **Modify** | Add an optional template parameter `Heap = use_default_heap` (a tag). When `use_default_heap`, behavior is unchanged. When `use_indexed_dary_heap`, the new heap is used. | +| **Verify** | Existing tests still pass (default path unchanged). | +| **Commit** | `feat(dijkstra): add heap-selector template parameter` | + +Rationale: keeps the change additive and reversible. We can flip the +default in a later phase once benchmarks confirm parity or improvement. + +### 2.2 Implementation Branch + +| Item | Detail | +|------|--------| +| **Modify** | Inside `dijkstra_shortest_paths`, dispatch to one of two inner implementations based on the `Heap` tag. Share the visitor / relax / source-seeding code via a small helper. | +| **Action** | The indexed-heap implementation:
- Removes the stale-pop skip (no stale entries possible).
- Replaces re-push with `decrease` on the relax path.
- Removes `weighted_vertex` (heap stores ids only; distance is read live via `DistanceFn`). | +| **Verify** | All existing Dijkstra tests pass under both code paths. Add a test variant that exercises each test with the indexed heap. | +| **Commit** | `feat(dijkstra): indexed d-ary heap implementation path` | + +### 2.3 Visitor Semantics Audit + +| Item | Detail | +|------|--------| +| **Action** | Confirm `on_examine_vertex` and `on_finish_vertex` fire exactly once per reachable vertex on the indexed-heap path. Confirm `on_edge_relaxed` and `on_edge_not_relaxed` counts match Boost's behavior. | +| **Verify** | Add a counting-visitor test that asserts call counts on a reference graph with both heap paths. | +| **Commit** | `test(dijkstra): visitor call-count parity across heap paths` | + +--- + +## Phase 3 — Mapped-Container Support + +### 3.1 Position Map for Mapped Graphs + +| Item | Detail | +|------|--------| +| **Read** | `agents/map_container_strategy.md`, `agents/map_container_plan.md` | +| **Action** | Wire the `assoc_position_map` adapter into the indexed-heap dispatch when `vertex_id_t` is non-integral or the graph is a mapped container. Decision criterion to be documented. | +| **Verify** | Run the Dijkstra test suite against mapped graph types with the indexed heap. | +| **Commit** | `feat(dijkstra): indexed-heap support for mapped containers` | + +### 3.2 Vertex-Property-Map Position Storage (optional) + +| Item | Detail | +|------|--------| +| **Action** | Investigate whether the position map can live inside the graph as a vertex property map (matching Boost's `vertex_property_map_generator`). Spike only — implement only if it removes a meaningful allocation on hot paths. | +| **Verify** | Benchmark before/after on mapped graphs. | +| **Commit** | `feat(dijkstra): in-graph position map for mapped containers` (only if accepted) | + +--- + +## Phase 4 — Benchmarks & Default Selection + +### 4.1 Comparative Benchmarks ✅ + +| Item | Detail | +|------|--------| +| **Action** | Run the Phase 0.3 benchmarks against (a) `priority_queue` path, (b) `indexed_dary_heap<2>`, (c) `indexed_dary_heap<4>`, (d) `indexed_dary_heap<8>`. Record results in `agents/indexed_dary_heap_results.md`. | +| **Verify** | Numbers stable across at least 3 runs. | +| **Status** | Completed in commit `fac4085`. Full numbers in `agents/indexed_dary_heap_results.md`. | + +### 4.2 Decide Default ✅ + +| Item | Detail | +|------|--------| +| **Action** | Based on results:
- If indexed `d=4` wins or ties on every workload, make it the default.
- If it loses on sparse small graphs, keep `priority_queue` default and document the selector.
- If results are mixed, consider a heuristic dispatch (e.g., based on E/V ratio) — but only with strong evidence. | +| **Modify** | Default heap parameter, plus a CHANGELOG entry. | +| **Verify** | Full test suite still green. Benchmarks regenerated. | +| **Commit** | `perf(dijkstra): switch default heap to indexed d-ary` (or document why not) | +| **Decision** | **Keep `use_default_heap` as the default.** Phase 4.1 results are mixed: indexed wins by 17–25% on high-E/V workloads (ER, BA) but loses by 22–39% on grid (E/V≈4) and path (E/V=1). The grid regression is too large to justify a universal switch, and a heuristic E/V dispatch was considered but rejected as premature (one workload axis is not strong enough evidence; users with known graph shapes can opt in explicitly). Documented `use_indexed_dary_heap<8>` as the recommended opt-in for high-E/V random / scale-free workloads on `compressed_graph` in the heap-tag doc comments and CHANGELOG. | +| **Status** | Completed. Default unchanged. CHANGELOG entry added; `use_default_heap` and `use_indexed_dary_heap` doc comments now record the recommendation. | + +### 4.3 BGL Comparison Benchmarks (optional validation) ✅ + +Run after 4.2 as a "how do we compare to BGL" sanity check, not as a +gating criterion for the default decision. + +#### Setup + +BGL is already available at `/home/phil/dev_graph/boost/`. Since it is +header-only, add an `include_directories` entry in +`benchmark/CMakeLists.txt` — no linking required. + +Create `benchmark/algorithms/bgl_dijkstra_fixtures.hpp` as a companion to +`dijkstra_fixtures.hpp`. It builds BGL equivalents from the same edge-list +generators so that both libraries operate on topologically identical graphs: + +| graph-v3 container | BGL equivalent | +|--------------------|----------------| +| `compressed_graph` | `compressed_sparse_row_graph` | +| `dynamic_graph` (vov) | `adjacency_list` | + +#### Invocation wrapper + +BGL Dijkstra uses property maps; a thin (~20-line) wrapper per container +type handles construction and invocation: + +```cpp +// Use the no_init + no_color_map variant to match graph-v3's semantics: +// caller pre-initialises distances; no color map allocation. +boost::dijkstra_shortest_paths_no_color_map_no_init( + bg, source, + boost::predecessor_map(boost::make_iterator_property_map(pred.begin(), get(boost::vertex_index, bg))) + .distance_map(boost::make_iterator_property_map(dist.begin(), get(boost::vertex_index, bg))) + .weight_map(get(&EdgeProp::weight, bg))); +``` + +#### Fairness rules + +| Concern | Rule | +|---------|------| +| **Init cost** | Use `_no_init` for BGL and pre-initialise distances before the timed region for both libraries — so neither pays init cost inside the timer. | +| **Compiler flags** | Both compiled with `-O3 -march=native`; confirm BGL headers are not accidentally included from a debug install. | +| **Heap difference** | BGL uses a d-ary heap (d=4) with decrease-key internally. Before Phase 4 graph-v3 will likely lose on dense graphs; after Phase 4 expect rough parity. Document this expectation in the results. | +| **Property-map overhead** | BGL's extra indirection layer may give graph-v3 a small constant advantage even at heap parity. Note it in the analysis. | + +| Item | Detail | +|------|--------| +| **Create** | `benchmark/algorithms/bgl_dijkstra_fixtures.hpp` | +| **Modify** | `benchmark/algorithms/benchmark_dijkstra.cpp` — add BGL variants alongside existing benchmarks | +| **Verify** | BGL and graph-v3 produce identical distance arrays on the same graph + source (add a correctness assert outside the timed loop). | +| **Verify** | Results recorded in `agents/indexed_dary_heap_results.md` alongside 4.1 numbers. | +| **Commit** | `bench(dijkstra): add BGL comparison benchmarks` | +| **Status** | Completed. CSR 100K results: BGL CSR is 10–15% faster than graph-v3 Idx8 on dense (ER/BA); graph-v3 ties/beats BGL CSR on low-E/V (grid/path); graph-v3 default beats BGL `adjacency_list` on every topology by 23–48%. Decision: no further heap changes; remaining dense-CSR gap is in CSR layout, not the heap. Full numbers in `agents/indexed_dary_heap_results.md` § Phase 4.3. | + +--- + +## Phase 5 — Reuse for Prim's MST ✅ + +### 5.1 Audit Prim's Implementation ✅ + +| Item | Detail | +|------|--------| +| **Read** | `include/graph/algorithm/mst.hpp` | +| **Action** | Identify whether Prim has the same lazy-deletion pattern. | +| **Finding** | `prim()` is a thin wrapper over `dijkstra_shortest_paths` with a custom `prim_combine` lambda that returns `w_uv` instead of `d_u + w_uv`. There is no separate priority queue and no lazy-deletion code in `mst.hpp`. The intent was that Prim would inherit any heap improvements made to Dijkstra "for free". | + +### 5.2 Apply Indexed Heap to Prim ✅ Implemented via Option 1 + +**Status:** Completed. The implementation attempt surfaced a pre-existing +latent Prim correctness bug (described under "Root cause" below). Phase 5 +landed Option 1 of the two fixes considered in 5.2; Option 2 is documented +below as a future optimization. + +#### Symptom that exposed the bug + +A new test `prim - indexed d-ary heap parity` over an 8-vertex weighted +undirected graph (correct MST weight = 18, verified by Kruskal): + +- Default-heap `prim()` returned total weight **13** (wrong; corrupts + `weight[]` after vertex finalization). +- Indexed-heap `prim()` aborted with `vector::operator[](npos)` from + `indexed_dary_heap::sift_up_` because `decrease(v)` was invoked on a + vertex `v` that had already been popped (position == `npos`). + +#### Root cause + +`dijkstra_shortest_paths` relies on the Dijkstra invariant: with non-negative +weights and `combine = plus`, distance is monotonic, so a finalized vertex +can never be relaxed again. The relax step therefore omits a "skip if +finalized" guard. + +Prim's combine `(d_u, w_uv) -> w_uv` breaks that invariant: the priority of +a vertex is just the cheapest currently-known incident edge, which is **not** +monotonic in the order vertices are popped. After `v` is finalized with +`weight[v] = w_uv`, a later-popped neighbor `y` may present an edge `y → v` +with `w_yv < weight[v]`. The relax succeeds, corrupts `weight[v]` (which is +the *output* MST tree-edge weight), and then: + +- Default heap: re-pushes `v`; the stale-pop check + `compare(distance, w) = compare(w_yv, w_yv) = false`, so `v` is examined a + second time and its outgoing edges are re-relaxed. Garbage MST weight. +- Indexed heap: calls `decrease(v)` on a finalized `v` whose position is + `npos` → out-of-bounds vector access → SIGABRT. + +The existing trivial Prim tests (triangles, 4-vertex paths, single-cluster +sparse graphs) never trigger the post-finalization re-relax case, so the +bug had been latent. + +#### Fix applied — Option 1 (guard inside `prim()` only) + +Wrap `weight_fn` so it returns `+infinity` for any vertex already in a +`finalized` set maintained by a Prim-specific visitor's `on_finish_vertex`. +The wrapped weight makes the relax test +`compare(combine(d_u, w_uv), weight[v])` evaluate to +`compare(infinity, weight[v]) = false`, suppressing both the corrupting +update and the spurious `decrease`. + +Storage is dispatched on `adj_list::index_vertex_range`: + +- Dense / contiguous-id graphs (e.g. `vector>`): `std::vector` + indexed by id. ~1 bit per vertex; one predictable branch and bit-load per + edge in the inner loop. +- Sparse / mapped-id graphs: `std::unordered_set>`. One hash + lookup per edge, used only when the dense path is not viable. + +The `Heap` template parameter is now exposed on `prim()`, so callers can +opt into `use_indexed_dary_heap{}` (Phase 4.2 recommendation for dense / +scale-free workloads). + +#### Option 2 — standalone Prim (deferred) + +Reimplementing `prim()` as a first-class algorithm (no Dijkstra reuse) would +remove the `combine`-lambda hack and the `distance[]` array that Dijkstra +maintains shadow-style for Prim's use. Expected gains are roughly **5–10%** +on dense graphs (one fewer `combine` call and one fewer indirection in the +relax loop, no shadow distance writes), with no asymptotic change — both +remain `O(E log V)`. Cost: a second algorithm body to maintain and to keep +in lockstep with future Dijkstra heap work. Not pursued in Phase 5; revisit +only if Prim becomes a measured bottleneck. + +#### Action + +| Item | Detail | +|------|--------| +| **Action** | Implemented Option 1 in `mst.hpp`; exposed `Heap` template parameter on `prim()` defaulting to `use_default_heap{}`. | +| **Verify** | New regression test `prim - indexed d-ary heap parity` (test_mst.cpp): 8-vertex graph, MST = 18 cross-checked against Kruskal. Default heap, `use_indexed_dary_heap<4>{}`, and `use_indexed_dary_heap<8>{}` all return 18 with matching `weight[]` arrays. Full ctest: 4848/4848 pass. | +| **Commit** | `fix(mst): correct Prim post-finalization re-relax + add heap selector (Phase 5)` | + +--- + +## Open Questions + +1. ~~**PositionMap ownership.** Owned by the heap (simplest, allocates per call), or + passed in (zero-allocation for repeated calls, more API surface)? Default to + owned-by-heap for the first cut.~~ **Resolved:** PositionMap is owned by the heap. +2. ~~**Arity as runtime vs compile-time.** Compile-time only — runtime would lose + the constexpr unrolling that justifies d-ary heaps in the first place.~~ **Resolved:** + Arity is a compile-time `std::size_t` template parameter. A runtime arity would + prevent the compiler from unrolling the inner child-comparison loop (the hot path + in `sift_down`), eliminating the main performance advantage of d-ary heaps over a + standard binary heap. Rationale is documented in `indexed_dary_heap.hpp`. +3. ~~**`Compare` indirection cost.** The heap calls `compare_(distance_(a), distance_(b))` + twice per sift-down step (one comparator call per child + one against the parent). + For trivial `DistanceFn` (vector lookup) this should inline; verify in benchmarks.~~ + **Resolved:** Disassembly of the release `benchmark_dijkstra` binary + (compressed_graph + `use_indexed_dary_heap<4>` + `std::less` + `container_value_fn>`) + shows the `run` lambda — which contains the inlined `sift_up_`, `sift_down_`, + `push_or_decrease`, `pop`, and the Dijkstra relax loop — has **zero `call` + instructions to `compare_` or `distance_`** in the hot path. + + Verified at both `-O3` and `-O2`: + + | Metric (run lambda body, CSR + Idx4) | -O3 | -O2 | + |---|---:|---:| + | `call` instructions to `compare_` / `distance_` / `std::less` | 0 | 0 | + | `call`s present (all cold: vector grow / new / delete / memcpy) | 2 | 24 | + | `ucomisd` comparison sites | 8 | 8 | + | Out-of-line `sift_up_` / `sift_down_` / `std::less::operator()` symbols in object | none | none | + + At -O3 the only call-outs are to `std::vector::_M_realloc_append`; at -O2 + the call-outs expand to direct `operator new` / `operator delete` / + `memcpy` (8 each, from the heap-vector growth path) but the comparison + sites themselves remain pure `ucomisd` against direct base+index*8 loads + from the distance buffer (e.g. `movsd (%r8,%r9,8),%xmm1`). GCC fully + inlines `heap_distfn` (the capturing lambda) → `container_value_fn::operator()` + → `vector::operator[]` to a raw `double*` indexed load and reduces + `std::less::operator()` to a bare compare at both optimisation + levels. The functional-style `compare_(distance_(a), distance_(b))` + abstraction has zero runtime cost for trivial `DistanceFn` / `Compare` + types, which is the only configuration the benchmarks measure. +4. ~~**Visitor `on_examine_vertex` semantics on multi-source seeding.** The current + multi-source code seeds N vertices into the queue. With the indexed heap, the + first pop of each source is the settled pop (no re-pushes possible since + distance is already 0). Confirm visitor semantics are unchanged.~~ + **Resolved:** Confirmed by inspection and by a parity test + (`dijkstra(indexed_heap) - multi-source visitor parity vs default heap`, + `tests/algorithms/test_dijkstra_indexed_heap.cpp`). With non-negative + weights every source is pushed at distance 0 and is finalized on its + first pop (no later relax can lower distance below 0), so on every path + each vertex fires `on_discover_vertex`, `on_examine_vertex`, and + `on_finish_vertex` exactly once. The default heap achieves this via its + stale-pop skip at the top of the main loop (documented at + `dijkstra_shortest_paths.hpp` lines 343–355); the indexed heap achieves + it structurally because true decrease-key means the heap never contains + duplicates. Test verifies all four counters (`examine`, `finish`, + `discover`, `relaxed`, `not_relaxed`) agree byte-for-byte across + `use_default_heap{}`, `use_indexed_dary_heap<4>{}`, and + `use_indexed_dary_heap<8>{}` on the CLRS graph with sources `{0, 3}`. +5. ~~**Should the new heap live in `graph/detail/` or be promoted to `graph/container/`?** + Defer the decision — start in `detail/` and promote only if external code finds it + useful.~~ **Resolved:** Keep `indexed_dary_heap` in `graph/detail/`. It is an + implementation detail of `dijkstra_shortest_paths` (and now `prim`), selected + internally via the `use_indexed_dary_heap` heap-tag selector. Users do not + instantiate or name the heap type directly, so there is no benefit to exposing + it in the public `graph/container/` namespace. Revisit only if a future external + use case appears. +6. ~~**Why is graph-v3 CSR slower than BGL CSR?** Phase 4.3 shows BGL's + `compressed_sparse_row_graph` is 10–15% faster than graph-v3 + Idx8 on dense + graphs (ER/BA). However the comparison is not arity-equivalent: BGL's + `d_ary_heap_indirect` is hard-coded to arity 4 (`d_ary_heap_indirect` + in `boost/graph/dijkstra_shortest_paths.hpp`), while the Phase 4.3 table used + graph-v3 Idx8. The benchmark already has an Idx4 variant; re-running the + comparison with `BM_Dijkstra_CSR_*_Idx4` vs `BM_Dijkstra_BGL_CSR_*` would give + an apples-to-apples heap comparison and may narrow or close the gap. + Remaining candidate causes if a gap persists: (a) BGL's + `get(&bgl_edge_prop::weight, g)` weight-map may compile to a raw pointer stride + with zero indirection, while graph-v3 edge values go through an `edge_value()` + accessor that may add a level of indirection or prevent auto-vectorisation; + (b) cache-line alignment differences in the CSR adjacency arrays. + Resolving this fully would require profiling (perf/vtune) and is out of scope + for the current plan.~~ + **Resolved:** Re-ran on the same machine, n = 100 K, 3-run averages of CPU + time. Full table in `indexed_dary_heap_results.md` § "Phase 4.3a — Apples-to-apples + re-run with Idx4". Summary: + + | Topology | Idx4 vs BGL CSR | Idx8 vs BGL CSR | + |---|---:|---:| + | ER Sparse | +7.7% | +5.9% | + | BA | +9.4% | +4.6% | + | Grid | +36.5% | +38.5% | + | Path | +15.0% | +14.6% | + + **Arity is not the bottleneck.** Switching to Idx4 (apples-to-apples with + BGL's hard-coded arity-4) does not narrow the gap on any topology — Idx4 and + Idx8 are within 1–3 percentage points of each other, and on BA Idx8 is + actually slightly faster (power-law graphs reward wider arity on high-degree + hubs). The gap is largest on Grid (~37%), which has the most predictable + heap-access pattern of any topology — if the heap were the bottleneck Grid + would have the *smallest* gap, not the largest. The remaining gap is in + the relax loop's edge-value access, not the heap. Most plausible cause is + BGL's `get(&edge_prop::weight, g)` resolving to a raw `Weight*` indexed by + edge offset, vs graph-v3's `edge_value(g, uv)` going through an iterator + `value()` accessor on `compressed_graph`'s edge-property storage. Confirming + and fixing this requires `perf stat` / `perf record` profiling on + `compressed_graph` and is out of scope for this plan. **No further heap + changes recommended.** For dense CSR workloads, prefer Idx8 over Idx4 + (consistently 1–5 percentage points faster). + +--- + +## Out of Scope + +- Fibonacci heap, pairing heap, or radix heap implementations. +- Replacing other algorithms' priority queues (BFS variants, A*, etc.). +- Changing public algorithm signatures beyond adding the optional `Heap` template + parameter. +- Parallel / concurrent heap variants. diff --git a/agents/dary_heap/indexed_dary_heap_results.md b/agents/dary_heap/indexed_dary_heap_results.md new file mode 100644 index 0000000..5623738 --- /dev/null +++ b/agents/dary_heap/indexed_dary_heap_results.md @@ -0,0 +1,591 @@ +# Dijkstra Comparative Benchmarks — Phase 4.1 + +Captured: 2026-04-25 +Branch: `indexed-dary-heap` +Binary: `build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra` +Flags: `--benchmark_min_time=1s` +Runs: 3 (averages reported; CV flags entries with coefficient of variation > 5%) + +## Machine + +| Property | Value | +|----------|-------| +| Host | Titania | +| CPUs | 20 × 3609.6 MHz | +| L1-D | 48 KiB × 10 | +| L2 | 1280 KiB × 10 | +| L3 | 25600 KiB × 1 | +| OS | Linux | + +## Heap variants compared + +| Tag | Description | +|-----|-------------| +| **Default** | `use_default_heap` — `std::priority_queue`, lazy deletion | +| **Idx2** | `use_indexed_dary_heap<2>` — binary heap, true decrease-key | +| **Idx4** | `use_indexed_dary_heap<4>` — 4-ary heap, true decrease-key | +| **Idx8** | `use_indexed_dary_heap<8>` — 8-ary heap, true decrease-key | + +--- + +## Results — CSR (`compressed_graph`) primary container + +All times are wall-clock nanoseconds per Dijkstra call (average of 3 runs). +`†` = CV > 5% (run-to-run graph variation; ER/BA are re-generated each run). +`↑` = improvement vs Default; `↓` = regression. + +### Erdős–Rényi, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 58,360 | 1,272,919 | 27,049,885 † | — | +| Idx2 | 46,190 | 1,089,690 | 24,233,457 † | **↑ −10%** | +| Idx4 | 57,661 | 1,178,680 | 25,756,981 † | **↑ −5%** | +| Idx8 | 57,465 | 1,171,452 | 20,216,860 | **↑ −25%** | + +> Note: CV is high for ER graphs (19–34% at 100K) because the topology is re-randomised +> between runs. The intra-run RMS reported by Google Benchmark is < 5% for all variants. +> **Idx8 at 100K reaches 20.2 ms, meeting the −25% target vs the Phase 0 baseline (29.1 ms).** + +### 2D Grid, E/V ≈ 4 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 24,385 | 505,121 | 6,026,301 | — | +| Idx2 | 18,664 | 509,521 | 6,671,405 | ↓ +11% | +| Idx4 | 14,554 | 604,026 | 8,165,088 | ↓ +35% | +| Idx8 | 15,033 | 609,548 | 8,400,126 | ↓ +39% | + +> Grid graphs have moderate re-relaxation but a low E/V ratio (≈4). +> The indexed heap's position-map bookkeeping overhead outweighs the decrease-key benefit. +> **Default heap wins on grid — indexed heap should not be the default for grid-like workloads.** + +### Barabási–Albert, m=4, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 53,632 | 1,261,122 | 22,904,717 | — | +| Idx2 | 42,732 | 1,062,473 | 22,125,874 | **↑ −3%** | +| Idx4 | 55,970 | 1,140,516 | 19,998,964 | **↑ −13%** | +| Idx8 | 54,138 | 1,135,116 | 19,038,871 | **↑ −17%** | + +> BA graphs have hub vertices with high degree → many decrease-key calls → indexed heap wins. +> Idx8 provides the best result (−17%). + +### Path graph, E/V = 1 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 2,885 | 27,188 | 268,708 | — | +| Idx2 | 3,213 | 32,342 | 329,337 | ↓ +23% | +| Idx4 | 3,299 | 32,729 | 326,018 | ↓ +21% | +| Idx8 | 3,232 | 32,787 | 327,820 | ↓ +22% | + +> Path graph = zero decrease-key calls. The indexed heap has overhead (position-map writes +> on every push) but never benefits. **Default heap wins on minimal-relaxation workloads.** + +--- + +## Results — VoV (`dynamic_graph`) secondary container + +### Erdős–Rényi, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 49,568 | 1,270,500 | 28,499,453 | — | +| Idx4 | 58,856 | 1,239,416 | 27,529,959 | **↑ −3%** | + +### 2D Grid, E/V ≈ 4 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 24,193 | 492,725 | 6,553,458 | — | +| Idx4 | 13,909 | 643,017 | 9,346,129 | ↓ +43% | + +### Barabási–Albert, m=4, E/V ≈ 8 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 47,737 | 1,315,464 | 26,706,329 † | — | +| Idx4 | 57,718 | 1,237,012 | 25,880,471 | **↑ −3%** | + +### Path graph, E/V = 1 + +| Heap | 1K ns | 10K ns | 100K ns | vs Default (100K) | +|------|------:|-------:|--------:|:-----------------:| +| Default | 4,539 | 43,331 | 433,332 | — | +| Idx4 | 4,925 | 46,501 | 466,101 | ↓ +8% | + +--- + +## Cross-topology summary (CSR, 100K vertices) + +| Topology | E/V | Best heap | Win vs Default | Note | +|----------|----:|-----------|:--------------:|------| +| ER Sparse | ≈8 | **Idx8** | −25% | High re-relaxation; meets target | +| BA | ≈8 | **Idx8** | −17% | Hub vertices drive decrease-key | +| Grid | ≈4 | **Default** | — (indexed +39%) | Low heap pressure; position-map overhead dominates | +| Path | 1 | **Default** | — (indexed +22%) | No decrease-key benefit; pure overhead | + +## Key Observations + +| Observation | Detail | +|-------------|--------| +| **Idx8 wins on high E/V graphs** | ER Sparse −25%, BA −17% at 100K CSR. Higher arity → less sift-down cost per pop, better cache locality. | +| **Default wins on low E/V graphs** | Grid (+39%), Path (+22%). Position-map bookkeeping dominates when decrease-key calls are rare. | +| **Arity 8 > 4 > 2 on ER/BA** | Consistent ordering; higher arity worth the wider sift-down fan-out. | +| **VoV gap is smaller** | VoV Idx4 is only marginally better on ER/BA (−3%) vs CSR Idx8 (−25%). Extra indirection through VoV reduces the heap's relative contribution. | +| **Phase 0 target met on CSR ER Sparse** | Idx8 at 20.2 ms vs baseline 29.1 ms = **−31%** (target was −25%). | + +## Recommendation for Phase 4.2 + +Results are **topology-dependent** — a single default cannot be optimal across all workloads: + +- **Do not change the default to indexed heap unconditionally.** The grid regression (+39%) is too severe. +- **Document the selector pattern** for users who know their topology has high re-relaxation (dense random / BA-like graphs). +- **Consider a heuristic**: if `E/V > threshold` (e.g., 6), auto-select Idx8; otherwise keep Default. Requires computing E/V at call time — adds overhead but could be compile-time detectable for CSR. +- **Best documented choice**: `use_indexed_dary_heap<8>` for dense random/BA graphs on CSR; `use_default_heap` otherwise. + +--- + +## Phase 4.3 — BGL Comparison + +Boost.Graph (header-only, version at `/home/phil/dev_graph/boost`) wired into +the same benchmark harness. Both libraries operate on topologically identical +graphs built from the same `edge_list` (see `bgl_dijkstra_fixtures.hpp`). BGL +uses `dijkstra_shortest_paths_no_color_map_no_init` for fairness — caller +pre-initialises distances; no color-map allocation inside the timed region. + +A startup parity check (`check_bgl_distance_parity`) asserts that BGL and +graph-v3 produce identical distance vectors for ER, BA, and Path graphs at +n=1024 from source 0. Benchmarks abort if parity fails. + +Build with: `cmake -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/path/to/boost`. + +### Results — n = 100 000 (3-run average) + +| Topology | graph-v3 default (CSR) | graph-v3 Idx8 (CSR) | BGL CSR | BGL adjacency_list | +|----------|------------------------:|---------------------:|---------:|--------------------:| +| ER Sparse (E/V≈8) | 26.2 ms | **22.9 ms** | **19.9 ms** | 34.2 ms | +| BA (E/V≈8) | 26.9 ms | **21.7 ms** | **19.6 ms** | 30.9 ms | +| Grid (E/V≈4) | **6.2 ms** | 8.9 ms | **6.1 ms** | 9.9 ms | +| Path (E/V=1) | **0.270 ms** | 0.329 ms | **0.283 ms** | 0.522 ms | + +### Observations + +| Observation | Detail | +|-------------|--------| +| **graph-v3 beats BGL `adjacency_list` on every topology** | Default heap beats BGL adj by 23–48% (ER 26.2 vs 34.2 ms, BA 26.9 vs 30.9, Grid 6.2 vs 9.9, Path 0.27 vs 0.52). Reflects graph-v3's iterator-based edge layout vs BGL's property-map indirection. | +| **BGL CSR is the fastest CSR on dense graphs** | BGL CSR wins ER (-13% vs Idx8) and BA (-10% vs Idx8). BGL CSR uses a 4-ary indexed heap natively *and* a tighter CSR layout — that combination still has an edge over our Idx8. | +| **graph-v3 ties BGL CSR on grid and path** | Grid: 6.16 vs 6.05 ms (+2%). Path: 0.270 vs 0.283 ms (-5% — graph-v3 default actually faster). At low E/V the heap implementation no longer matters; layout cost dominates and the two CSR layouts are equivalent. | +| **Idx8 closes most of the gap to BGL CSR** | On ER and BA, Idx8 is within 13–15% of BGL CSR (vs 32–37% for default). Switching to Idx8 captures the bulk of the dense-graph win available from a true decrease-key heap. | +| **Adjacency-list comparison validates default-heap choice** | Against the closer-equivalent `adjacency_list` container, graph-v3 with the default heap is 23–48% faster across all four topologies — confirming that the Phase 4.2 decision (keep `use_default_heap`) does not reflect a missing-feature gap vs BGL. | + +### Conclusion + +- For random / scale-free workloads on CSR, BGL's CSR + native indexed heap is + ~10–15% faster than graph-v3's `compressed_graph` + `use_indexed_dary_heap<8>`. + The remaining gap is plausibly attributable to BGL's CSR using `boost::vec_adj_list_traits` + edge-property layout (hot-path arrays packed differently from our edge descriptors). +- For low-E/V workloads (grid, path) graph-v3 ties or beats BGL CSR with the + default heap — there is no missing optimisation here. +- Against `boost::adjacency_list` (the closer match for `dynamic_graph`), + graph-v3 wins on every topology measured. +- No further heap changes recommended on the strength of these results: the + Phase 4.2 decision (default = `use_default_heap`, opt-in `use_indexed_dary_heap<8>`) + remains the right configuration. Future work, if the dense-CSR gap matters + to a user, is in CSR layout, not in the heap. + +### Phase 4.3a — Apples-to-apples re-run with Idx4 (resolves Open Q6) + +BGL's `dijkstra_shortest_paths` hard-codes `d_ary_heap_indirect` +(see `boost/graph/dijkstra_shortest_paths.hpp`). The Phase 4.3 table compared +against graph-v3 Idx8, leaving open whether the dense-CSR gap was an arity +artefact. Re-run on the same machine, n = 100 000, 3-run averages of CPU time +(`--benchmark_min_time=2s`): + +| Topology | graph-v3 Idx4 | graph-v3 Idx8 | BGL CSR | Idx4 vs BGL | Idx8 vs BGL | +|----------|--------------:|--------------:|--------:|-----------:|-----------:| +| ER Sparse (E/V≈8) | 22.77 ms | 22.38 ms | 21.14 ms | +7.7% | +5.9% | +| BA (E/V≈8) | 21.99 ms | 21.02 ms | 20.10 ms | +9.4% | +4.6% | +| Grid (E/V≈4) | 8.64 ms | 8.77 ms | 6.33 ms | +36.5% | +38.5% | +| Path (E/V=1) | 0.330 ms| 0.329 ms | 0.287 ms | +15.0% | +14.6% | + +**Findings** + +- **Arity is not the bottleneck.** Switching to Idx4 (apples-to-apples with + BGL) does not narrow the gap: ER and BA stay within 1–3 percentage points + of the Idx8 number; Grid is unchanged at ~37%; Path is unchanged at ~15%. + On BA, Idx8 is actually a touch faster than Idx4 (+4.6% vs +9.4%) — BA's + power-law degree distribution rewards wider arity (fewer levels per + decrease-key on high-degree hubs). +- **The gap is uniform across topologies that exercise the heap very + differently.** Grid (uniform degree 4, predictable heap pattern) shows the + biggest absolute gap (36–38%). If the heap were the bottleneck the gap + would track topology, not be uniform across them. +- **Suspect: weight-map indirection in the relax loop.** BGL's + `get(&edge_prop::weight, g)` typically resolves to a raw `Weight*` indexed + by edge offset (zero indirection). graph-v3's `edge_value(g, uv)` + goes through the iterator's `value()` accessor on the `compressed_graph`'s + edge-property storage, which may add a level of pointer-chasing or block + auto-vectorisation. Verifying this requires `perf stat -e + L1-dcache-load-misses,branch-misses` or `perf record` profiling and is + out of scope for this plan. +- **Recommendation: no further heap work.** The Phase 4.2 decision stands + (default = `use_default_heap`, opt-in `use_indexed_dary_heap`). For + dense workloads on CSR, prefer Idx8 over Idx4 (consistently 1–5 + percentage points faster on ER and BA at n = 100 K). If the dense-CSR + gap to BGL becomes important to a user, the next investigation belongs + in `compressed_graph`'s edge-value access path, not in the heap. + +--- + +## Phase 4.3b — CSR access-path profiling (Phase 1 of `csr_edge_value_perf_plan.md`) + +Captured: 2026-04-26 +Goal: Quantify how the Idx4-vs-BGL CSR gap scales with `n`, to discriminate between +work-bound (constant per-edge overhead) and memory-bound (gap widens with `n` as the +working set spills out of cache) hypotheses. + +### Setup + +| Knob | Value | +|------|-------| +| Binary | `build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra` | +| Flags | `--benchmark_min_time=2s` | +| Pinning | `taskset -c 4` (single physical core) | +| Runs | 5 (median CPU time reported) | +| Sizes | `n ∈ {10 000, 100 000}` (only sizes registered by the benchmark) | +| Topologies | BA, ER_Sparse, Grid (4-regular), Path | + +Working-set fit (vertices × 4B + edges × ~16B): +- `n = 10 000`, BA/ER (~10 edges/v): ~0.6 MB → fits in L2 (1.28 MB) ✅ +- `n = 100 000`, BA/ER: ~6 MB → spills to L3 (25 MB) ✅ +- `n = 100 000`, Grid (4 edges/v): ~2.4 MB → fits in L3 ✅ +- All sizes fit in L3. + +### 1.1 Multi-size baseline (median of 5 runs, CPU ns) + +| Topology | n | Idx4 | Idx8 | BGL_CSR | Idx4/BGL | Idx8/BGL | +|----------|---:|------:|------:|---------:|---------:|---------:| +| BA | 10 000 | 1 185 000 | 1 179 420 | 990 450 | **1.196×** | 1.191× | +| ER_Sparse | 10 000 | 1 192 860 | 1 169 130 | 987 541 | **1.208×** | 1.184× | +| Grid | 10 000 | 624 630 | 628 727 | 454 421 | **1.375×** | 1.384× | +| Path | 10 000 | 33 531 | 33 760 | 28 204 | **1.189×** | 1.197× | +| BA | 100 000 | 23 651 400 | 22 222 500 | 21 449 200 | **1.103×** | 1.036× | +| ER_Sparse | 100 000 | 23 318 800 | 23 397 300 | 22 184 000 | **1.051×** | 1.055× | +| Grid | 100 000 | 8 701 240 | 8 909 080 | 6 359 360 | **1.368×** | 1.401× | +| Path | 100 000 | 336 510 | 332 385 | 289 129 | **1.164×** | 1.150× | + +### Scaling table — does the Idx4-vs-BGL gap grow with `n`? + +| Topology | gap @ 10 K | gap @ 100 K | Δ (pp) | +|-----------|-----------:|------------:|-------:| +| BA | 1.196× | 1.103× | −9.4 | +| ER_Sparse | 1.208× | 1.051× | −15.7 | +| Grid | 1.375× | 1.368× | −0.6 | +| Path | 1.189× | 1.164× | −2.5 | + +### Interpretation + +- The Idx4-vs-BGL gap **does not grow** with `n` for any topology, even as the working + set crosses from L2 (10 K) to L3 (100 K). For BA and ER it actually **shrinks** by + 9–16 percentage points, consistent with a fixed setup/initialisation cost + (allocator warmup, position-map alloc, vector grow) being amortised over the larger + workload. +- This is **inconsistent with a memory-bound hypothesis** (suspect 2: cold edge_value + cache lines, suspect 5: prefetch). If the gap were caused by extra cache traffic + per edge it would widen with `n`, not stay flat or shrink. +- The Grid topology shows the **most stable** gap (~1.37× at both sizes). Grid is + 4-regular with a deterministic stencil pattern, so it has the cleanest per-edge + signal and the smallest overhead-amortisation effect. **This is the topology to + focus subsequent profiling on.** +- **Working hypothesis after Phase 1.1**: the gap is **work-bound** — extra + instructions executed per edge in the relax inner loop (suspect 1: pointer-subtract + + extra load to reach `edge_value_`, suspect 3: `target_id` two-hop, or suspect 4: + `compressed_graph::find_vertex`). Phase 1.2/1.3 (perf counters + perf annotate) are + needed to confirm by checking `instructions/edge` and miss rates. + +### 1.2 Hardware counters — **DEFERRED** + +`/usr/lib/linux-tools-6.8.0-110/perf` is installed but Linux PMC events are +`` under this WSL2 kernel. Enabling requires +`nestedVirtualization=true` in `%UserProfile%\.wslconfig` followed by +`wsl --shutdown`. Once available, the counters to capture are: + +``` +perf stat -e cycles,instructions,L1-dcache-load-misses,LLC-load-misses,\ + branch-misses,branch-instructions \ + taskset -c 4 ./benchmark_dijkstra \ + --benchmark_filter='^BM_Dijkstra_(CSR_Grid_Idx4|BGL_CSR_Grid)/100000$' \ + --benchmark_min_time=2s +``` + +Compute and tabulate: IPC, cycles/edge, loads/edge, L1-D miss rate, LLC miss +rate, branch-miss rate. Repeat for `ER_Sparse` to confirm. + +**Predicted outcome (from 1.1):** Idx4 will show *more instructions/edge* with +*similar miss rates* — the work-bound signature. + +### 1.3 perf record + annotate — **DEFERRED** (same WSL2 PMC limitation) + +### Verdict (preliminary, pending 1.2/1.3 confirmation) + +**Memory-bound hypothesis ruled out by the scaling test.** The flat-or-shrinking +gap across an L2→L3 transition leaves only the work-bound suspects from the plan: +1, 3, and 4. Phase 2 (disassembly diff) of `csr_edge_value_perf_plan.md` can +proceed in parallel with 1.2/1.3 and may itself be sufficient to identify the +extra-instruction site. + +--- + +## Phase 4.3b (Windows) — VTune software-mode hotspots, Grid_Idx4/100K (MSVC) + +Captured: 2026-04-26 +Binary: `build/windows-msvc-relwithdebinfo/benchmark/algorithms/benchmark_dijkstra.exe` +(MSVC 19.50.35729, `/O2 /Ob2 /Zi` from `windows-msvc-relwithdebinfo` preset) +Tool: Intel VTune Profiler 2025.10.0 (build 631836), `-collect hotspots -knob sampling-mode=sw` +Result dir: `build/vtune/hotspots_grid_idx4_100k_msvc_001` (raw .vtune dir, gitignored) +Command: `benchmark_dijkstra.exe --benchmark_filter=^BM_Dijkstra_CSR_Grid_Idx4/100000$ --benchmark_min_time=15s --benchmark_repetitions=1` +CPU: Intel Alder Lake-S, 12C / 20T, base 3.61 GHz; 30 s sample window. + +### Why software-mode (no µarch breakdown yet) + +VTune's hardware event-based sampling (`-collect uarch-exploration`) needs +either the SEP sampling driver installed or the collector running as +Administrator. The current Windows session has neither, so this run uses +user-mode sampling. Result: per-function and per-source-line CPU-time +attribution, but **no Front-End / Bad-Speculation / Back-End-Memory / +Retiring breakdown**. The µarch run is deferred — see "Next" at the end. + +### Top hotspots (function level) + +| Rank | CPU time | % of total | Symbol | +|------|---------:|-----------:|--------| +| 1 | 9088 ms | 31.2 % | `indexed_dary_heap<...,vector_position_map,4,...>::sift_down_` | +| 2 | 3692 ms | 12.7 % | `std::less::operator()` (1st copy) | +| 3 | 2768 ms | 9.5 % | `container_value_fn>::operator()` | +| 4 | 1511 ms | 5.2 % | `vector::operator[]` (distance buffer) | +| 5 | 1294 ms | 4.4 % | dijkstra `relax_target` lambda body | +| 6 | 1277 ms | 4.4 % | `incidence_view::iterator::operator*` | +| 7 | 926 ms | 3.2 % | `std::less::operator()` (2nd copy) | +| 8 | 783 ms | 2.7 % | `vector_iterator>::operator++` | +| 9 | 751 ms | 2.6 % | dijkstra `run` lambda body | +| 10 | 704 ms | 2.4 % | `indexed_dary_heap<...>::sift_up_` | +| 11 | 478 ms | 1.6 % | `std::less::operator()` (3rd copy) | +| 12 | 461 ms | 1.6 % | `vector::operator[]` | + +Total of the twelve = 23.7 s out of 30 s observed CPU time = **79 %** of the work. + +### Top source lines (where the cycles actually go) + +| CPU time | File:Line | What it is | +|---------:|-----------|------------| +| 5417 ms | `type_traits:2388` | (libstdc++ MSVC STL `std::less::operator()` impl) | +| 5309 ms | `indexed_dary_heap.hpp:228` | `sift_down_` inner child-comparison loop | +| 3549 ms | `traversal_common.hpp:188` | `container_value_fn::operator()` returning `c[uid]` | +| 2332 ms | `indexed_dary_heap.hpp:234` | `sift_down_` "smallest child vs k" check | +| 1526 ms | `vector:1934` | `vector::operator[]` | +| 1277 ms | `incidence.hpp:180` | edge descriptor materialisation | +| 925 ms | `indexed_dary_heap.hpp:238` | `sift_down_` `i = best` advance | +| 818 ms | `dijkstra_shortest_paths.hpp:252` | `w_uv = weight(g, uv)` in relax lambda | +| 783 ms | `vector:287` | iterator `operator++` | +| 655 ms | `indexed_dary_heap.hpp:185` | `place_` writing `heap_[i] = k` | +| 386 ms | `dijkstra_shortest_paths.hpp:465` | `relax_target(uv, uid)` call site | + +### Headline finding — **MSVC is not inlining what GCC inlined** + +The Linux/GCC analysis (Open Question 3 in this document) verified by +disassembly that under both `-O3` and `-O2`: + +- `std::less::operator()` collapses to a single `ucomisd`. +- `container_value_fn::operator()` collapses to a `double*` indexed load. +- `sift_up_` / `sift_down_` are fully inlined into the run lambda. + +Under MSVC `/O2` on the same source, **none of those collapses happen**: + +- `std::less::operator()` appears as **three distinct callable + symbols** consuming **5096 ms = 17.5 %** of total CPU time. +- `container_value_fn::operator()` is a real call (2768 ms = 9.5 %). +- `sift_down_` is a real, non-inlined function consuming 9088 ms = **31.2 %** + on its own. Combined with `sift_up_` and the heap update path, indexed-heap + bookkeeping eats over a third of the workload. + +This **revises the Phase 4.3a diagnosis on MSVC** (the original was +GCC-specific): + +- On GCC the heap is fully inlined and the residual gap to BGL is in the + edge-value access path (`edge_value(g, uv)` → `edge_value_[k]`). +- On MSVC the heap `sift_down_` alone outweighs everything else — and three + copies of `std::less` not being merged is a known MSVC ABI behaviour + (each lambda capturing the comparator gets its own instantiation). + +### Implications for the original perf plan + +| Item from `csr_edge_value_perf_plan.md` | Status under MSVC | +|---|---| +| Phase 1.4 verdict — work-bound vs memory-bound | Software sampling can't classify; needs HW counters. | +| Open Q3 — `compare_(distance_(...))` collapses | **Holds for GCC, fails for MSVC.** Multiple `std::less` symbols visible. | +| Phase 2 — disassembly comparison | Even more important now: MSVC asm of `sift_down_` is the first thing to look at. | +| Phase 4 candidate fix #1 — offset-aware `edge_value` | Less promising on MSVC (the heap dominates, not the edge access). | +| Phase 4 candidate fix #2 — `incidence` fast path | Still relevant; `incidence_view::iterator::operator*` is 1277 ms. | +| New MSVC-specific candidate | **Force-inline / hoist the comparator.** `__forceinline` or a wrapper that takes the captured `std::less` by value and inlines its operator. | +| New MSVC-specific candidate | **Inline `sift_down_`.** Annotate with `__forceinline` on MSVC; with arity 4 the body is small enough that this is profitable. Verify by re-profiling. | + +### Next + +| Step | What | Why | +|---|---|---| +| 1 | Re-run as Administrator (or install SEP driver) with `-collect uarch-exploration` | Get Front-End / Back-End-Memory / Retiring breakdown — confirms whether the call overhead is back-end-core (real work) or back-end-memory (data stalls). | +| 2 | Disassemble `sift_down_` at MSVC `/O2` (VS Disassembly window from a debug-attached run) | Confirm the function is genuinely a separate call frame, not a thunk that just shows up in symbol-time accounting. | +| 3 | Spike `__forceinline` on `sift_down_`, `sift_up_`, `less_than_`, `place_` | Cheap experiment; rerun the same hotspots collection and compare. If the heap symbols disappear from the top-12 and total time drops, this is the win. | +| 4 | Only after the heap is inlined, retry the GCC-style edge-value-access investigation in `csr_edge_value_perf_plan.md` | The original diagnosis assumed an inlined heap; that assumption is invalid on MSVC until step 3. | + +--- + +## Phase 4.3c — `GRAPH_DETAIL_FORCE_INLINE` spike results (MSVC, Grid_Idx4/100K) + +**Date:** 2026-04-27 +**Branch:** `indexed-dary-heap` +**Build:** `windows-msvc-relwithdebinfo` (`/O2 /Ob2 /Zi`) +**VTune result:** `vtune/hotspots_grid_idx4_100k_msvc_004` +**Filter:** `BM_Dijkstra_CSR_Grid_Idx4/100000`, `--benchmark_min_time=15s` +**Collector:** software-mode sampling (~29 s wall-clock, 28.58 s CPU collected) + +### Changes applied + +``` +// GRAPH_DETAIL_FORCE_INLINE macro (MSVC → __forceinline, GCC/Clang → [[gnu::always_inline]] inline) +// Applied to: +// place_() — single write-point for heap_[i] + position map update +// less_than_() — comparator choke point; sift_up_ / sift_down_ now call this +// instead of compare_(distance_(a), distance_(b)) directly +// NOT applied to sift_up_ / sift_down_ (bodies too large; would bloat call sites) +// NOT applied to parent_of_ / first_child_of_ (static constexpr — always inlined) +``` + +### Top-15 hotspot comparison (CPU time, seconds) + +| Rank | Baseline (004 pre-spike / result 001) | CPU (s) | % | Post-spike (result 004) | CPU (s) | % | Δ % | +|------|---------------------------------------|---------|---|-------------------------|---------|---|-----| +| 1 | `heap::sift_down_` | 9.09 | 31.2 | `heap::sift_down_` | 8.99 | 31.5 | ≈0 | +| 2 | `std::less::operator()` (×1) | 3.69 | 12.7 | `std::less::operator()` (×1) | 3.93 | 13.8 | +1.1 | +| 3 | `container_value_fn::operator()` | 2.77 | 9.5 | `container_value_fn::operator()` | 2.86 | 10.0 | +0.5 | +| 4 | `vector::operator[]` | — | — | `vector::operator[]` | 1.98 | 6.9 | new | +| 5 | dijkstra `relax` lambda | 1.28 | 4.4 | dijkstra lambda | 1.23 | 4.3 | ≈0 | +| 6 | `incidence_view` iterator | 1.28 | 4.4 | `std::less` (2nd copy) | 0.86 | 3.0 | — | +| 7 | `std::less` (2nd copy) | 0.93 | 3.2 | `incidence_view` iterator | 0.83 | 2.9 | ≈0 | +| 8 | `heap::sift_up_` | 0.80 | 2.7 | `heap::sift_up_` | 0.80 | 2.8 | ≈0 | +| 9–15 | (various vector/iterator helpers) | — | — | (similar mix) | — | — | ≈0 | + +### Interpretation + +**The spike had no measurable effect.** The profile is essentially identical: + +- `sift_down_` remains the top symbol at ~31 % whether or not `less_than_` / `place_` are force-inlined. +- `std::less::operator()` still appears as multiple separate call frames (~17 % combined). +- `container_value_fn::operator()` is still a real non-inlined call (~10 %). + +This means **MSVC is not honouring `__forceinline` on `less_than_` and `place_` when called from inside `sift_down_`**, which is itself a separate, non-inlined function. The root cause is that `sift_down_` is not inlined into its call sites — so its callee force-inline annotations are local to its own body and do not collapse the full chain that GCC collapses. + +### Revised diagnosis + +The key missing piece is inlining `sift_down_` (and `sift_up_`) into the Dijkstra run-lambda. Until that happens, `__forceinline` on the inner helpers only affects calls *within* the sift body, which may already be inlined there; it does not help the outer symbol boundary. + +--- + +## Phase 4.3d — `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` + `sift_up_` (MSVC) + +**Date:** 2026-04-27 +**VTune result:** `vtune/hotspots_grid_idx4_100k_msvc_005` +**Change:** Added `GRAPH_DETAIL_FORCE_INLINE` to `sift_down_` and `sift_up_` declarations. +**Total CPU collected:** 28.38 s (vs 28.58 s in 004 — effectively identical) + +### Top-15 hotspots (result 005) + +| Rank | Function | CPU (s) | % | +|------|----------|---------|---| +| 1 | `heap::sift_down_` | 9.44 | 33.2 | +| 2 | `std::less::operator()` (1st) | 4.09 | 14.4 | +| 3 | `container_value_fn::operator()` | 2.50 | 8.8 | +| 4 | `vector::operator[]` | 1.61 | 5.7 | +| 5 | dijkstra relax lambda | 1.25 | 4.4 | +| 6 | `std::less::operator()` (2nd) | 1.06 | 3.7 | +| 7 | `incidence_view` iterator `operator*` | 0.92 | 3.2 | +| 8 | `vector::operator[]` | 0.67 | 2.4 | +| 9 | `heap::sift_up_` | 0.57 | 2.0 | +| 10 | `vector::push_back` | 0.42 | 1.5 | +| 11 | `heap::place_` | 0.33 | 1.1 | +| 12 | `container_value_fn::operator()` (2nd) | 0.31 | 1.1 | +| 13 | `_Vector_iterator::operator++` | 0.28 | 1.0 | +| 14 | `vector::size` | 0.28 | 1.0 | +| 15 | `heap::pop` | 0.27 | 1.0 | + +### Interpretation + +**`__forceinline` on `sift_down_` is also ineffective.** MSVC silently ignores the annotation — `sift_down_` still appears as a distinct 9.4 s (33.2%) real call frame. The profile is statistically indistinguishable from results 004 (pre-sift annotation). MSVC's inliner is making a size-based refusal that `__forceinline` does not override for a function of this complexity when the call site is itself a complex template instantiation. + +**Key conclusion:** `__forceinline` / `[[gnu::always_inline]]` annotations alone are not sufficient to close the MSVC vs GCC inlining gap for `sift_down_`. A different approach is needed. + +### Candidate next approaches + +| Priority | Approach | Rationale | +|----------|----------|-----------| +| **High** | Increase `/Ob` (inline depth) — try `/Ob3` (available MSVC 19.26+) in the CMake release preset | Raises MSVC's inline budget per call site; may allow `sift_down_` to be inlined where `/Ob2` refuses | +| High | Measure actual wall-clock ns before/after any change (not just symbol attribution) | Profile attribution is secondary; the benchmark median is the ground truth | +| Medium | Manually hoist the `sift_down_` body into the Dijkstra run-lambda (proof-of-concept) | Establishes whether MSVC *can* produce the inlined shape at all and what the ceiling win is | +| Medium | Profile with `/O2 /Ob3` release build and compare hotspot table | If `sift_down_` disappears from profile → the `/Ob` budget is the blocker | +| Low | Elevate VTune `uarch-exploration` (admin / SEP driver) | Front-End/Back-End breakdown is only useful once the symbol boundary is resolved | + +--- + +## Phase 4.3e — `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` (MSVC) + +**Date:** 2026-04-27 +**Branch:** `indexed-dary-heap` +**Build:** `windows-msvc-release` (`/O2 /Ob3 /DNDEBUG`, no PDB) +**VTune result:** `vtune/hotspots_grid_idx4_100k_msvc_ob3_001` +**Filter:** `BM_Dijkstra_CSR_Grid_Idx4/100000`, `--benchmark_min_time=15s` +**Collector:** software-mode sampling (~29 s wall-clock, 28.97 s CPU collected) + +### Changes applied (on top of Phase 4.3d state) + +``` +// CMakePresets.json — windows-msvc-release: +"CMAKE_CXX_FLAGS_RELEASE": "/O2 /Ob3 /DNDEBUG" // was /O2 /Ob2 /DNDEBUG + +// indexed_dary_heap.hpp: +GRAPH_DETAIL_FORCE_INLINE void sift_down_(size_type i) // re-annotated +``` + +### VTune hotspot result + +| Rank | Function | CPU (s) | % | +|------|----------|---------|---| +| 1 | `func@0x1400041d0` (inlined run-lambda) | 28.62 | **98.8** | +| 2–9 | misc CRT / allocator / timer helpers | 0.31 | 1.2 | + +**`sift_down_`, `sift_up_`, `std::less`, `container_value_fn`, `place_` — all gone from the profile.** 98.8% of CPU time is a single anonymous call frame, which is the Dijkstra run-lambda with the heap fully inlined into it. This is the same profile shape GCC produces at `-O2`. + +**Conclusion:** `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE` on `sift_down_` is the combination that closes the MSVC inlining gap. Neither alone was sufficient (Phase 4.3d showed `__forceinline` alone had no effect at `/Ob2`). + +### Wall-clock medians (5 reps, `windows-msvc-release`) + +| Benchmark | `/Ob2` baseline (ns) | `/Ob3` + FI (ns) | Δ | +|-----------|---------------------:|-----------------:|---| +| Grid_Idx4/1K | — | 26,444 | — | +| Grid_Idx4/10K | — | 562,731 | — | +| Grid_Idx4/100K | 6,873,101 | 7,485,252 | **+8.9%** (regression) | +| Path_Idx4/1K | — | 4,186 | — | +| Path_Idx4/10K | — | 42,927 | — | +| Path_Idx4/100K | 498,438 | 424,007 | **−14.9%** (win) | + +> Grid_Idx4/100K baseline was from `/Ob2` `relwithdebinfo` (with PDB/debug info overhead); the `/Ob3` release build without PDB is the fair comparison. The +8.9% regression on Grid may be noise or code-layout change. Path shows a clear −14.9% win — consistent with the profile showing the comparator chain now collapses. + +### Next steps + +| Priority | Step | +|----------|------| +| **High** | Run the full Grid/Path/ER/BA suite at `/Ob3` release and compare against the `/Ob2` baseline table in `indexed_dary_heap_baseline_msvc.md` — confirm win is consistent or isolate regressions | +| High | Commit `/Ob3` + `GRAPH_DETAIL_FORCE_INLINE sift_down_` as the permanent MSVC configuration if the full suite shows no regression | +| Medium | Proceed to Thread B: CSR edge-value access path gap vs BGL (`csr_edge_value_perf_plan.md`) — now the heap is inlined on both GCC and MSVC, the original GCC-measured gap is the next target | diff --git a/agents/dary_heap/perf_capture_manifest.txt b/agents/dary_heap/perf_capture_manifest.txt new file mode 100644 index 0000000..0e6e872 --- /dev/null +++ b/agents/dary_heap/perf_capture_manifest.txt @@ -0,0 +1,48 @@ +# Capture manifest for Phase 2/3 of csr_edge_value_perf_plan.md. +# +# Each non-comment, non-blank line: [substrings...] +# - is matched against the demangled symbol name (use this for +# patterns containing < or > to avoid cmd redirection issues). +# - subsequent substrings are AND-filtered after the regex. +# +# These artefacts give the Linux/GCC investigation a per-symbol reference +# baseline to compare against. WSL has no hardware counters, so disassembly +# diffs replace the perf-stat workflow. +# +# Run with: +# python scripts/perf/capture_asm.py \ +# --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \ +# --manifest agents/perf_capture_manifest.txt \ +# --out-dir artifacts/perf/msvc_profile + +# --- heap sift_down_ in both graph backends --- +sift_down_csr_idx2 0x300 use_indexed_dary_heap.2. sift_down_ compressed_graph +sift_down_csr_idx4 0x300 use_indexed_dary_heap.4. sift_down_ compressed_graph +sift_down_csr_idx8 0x300 use_indexed_dary_heap.8. sift_down_ compressed_graph +sift_down_vov_idx4 0x300 use_indexed_dary_heap.4. sift_down_ dynamic_graph + +# --- heap sift_up_ for completeness (much smaller) --- +sift_up_csr_idx4 0x180 use_indexed_dary_heap.4. sift_up_ compressed_graph + +# --- BGL counterparts (for parity with future GCC results) --- +# Boost's d_ary_heap_indirect uses preserve_heap_property_{down,up} methods. +# Two instantiations each: [0] is for compressed_sparse_row (vertex id u32), +# [1] is for adjacency_list (vertex id u64). We want the CSR one. +bgl_dary_sift_down_csr:0 0x500 preserve_heap_property_down d_ary_heap_indirect +bgl_dary_sift_up_csr:0 0x300 preserve_heap_property_up d_ary_heap_indirect + +# --- per-edge work suspects identified by VTune --- +# (vtune ranked these 4-6 in Phase 2 hotspot table) +# NOTE: incidence_view::iterator::operator* has no standalone body in this +# build - MSVC inlined it everywhere; only @ILT thunks remain. The behaviour +# is captured inside dijkstra_csr_idx{2,4,8} below. +container_value_fn:0 0x100 container_value_fn operator + +# --- the main Dijkstra body (CSR + Idx{2,4,8}) - the actual relax loop lives here --- +# All three Dijkstra entries match the same regex; pick by --pick if needed. +dijkstra_csr_idx2:0 0x320 ^graph::dijkstra_shortest_paths.*compressed_graph use_indexed_dary_heap +dijkstra_csr_idx4:1 0x320 ^graph::dijkstra_shortest_paths.*compressed_graph use_indexed_dary_heap +dijkstra_csr_idx8:2 0x320 ^graph::dijkstra_shortest_paths.*compressed_graph use_indexed_dary_heap + +# --- BGL Dijkstra equivalent for direct comparison --- +dijkstra_bgl_csr 0x800 dijkstra_shortest_paths_no_color_map_no_init compressed_sparse_row \ No newline at end of file diff --git a/agents/dary_heap/perf_capture_manifest_linux.txt b/agents/dary_heap/perf_capture_manifest_linux.txt new file mode 100644 index 0000000..0cb8a81 --- /dev/null +++ b/agents/dary_heap/perf_capture_manifest_linux.txt @@ -0,0 +1,33 @@ +# Linux/GCC capture manifest, parallels agents/perf_capture_manifest.txt. +# +# Differences from the MSVC manifest: +# - Demangled-name shape is the GCC/Itanium one, not MSVC. +# - GCC uses `Nul` (unsigned long) for non-type template args, not `Nu`. +# - GCC inlines the sift_down_/sift_up_ helpers AND BGL's +# preserve_heap_property_down/up entirely into the dijkstra body. +# There are no standalone bodies for those, so this manifest captures +# the enclosing dijkstra body instead — that is where the inlined +# instructions actually live, and that is what is directly comparable +# across toolchains. +# - graph-v3's dijkstra body lives inside an inner `{lambda(auto:1&)#1}:: +# operator()` (the heap-using closure) — that closure is the body +# comparable to BGL's `dijkstra_shortest_paths_no_color_map_no_init`, +# which under GCC is fully inlined into `graph::benchmark::run_bgl_dijkstra`. +# - Sizes from `nm --print-size` are honoured; the length here is a fallback. +# +# Run via: +# python3 scripts/perf/objdump_capture.py \ +# --exe build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra \ +# --manifest agents/perf_capture_manifest_linux.txt \ +# --out-dir artifacts/perf/linux_gcc + +# --- graph-v3 dijkstra bodies (heap is fully inlined into these) --- +dijkstra_csr_idx2 0x800 use_indexed_dary_heap<2ul> operator() compressed_graph +dijkstra_csr_idx4 0x800 use_indexed_dary_heap<4ul> operator() compressed_graph +dijkstra_csr_idx8 0x800 use_indexed_dary_heap<8ul> operator() compressed_graph +dijkstra_vov_idx4 0x800 use_indexed_dary_heap<4ul> operator() dynamic_graph + +# --- BGL dijkstra body (preserve_heap_property_{down,up} are inlined here) --- +dijkstra_bgl_csr 0x900 run_bgl_dijkstra compressed_sparse_row_graph +dijkstra_bgl_adj 0x900 run_bgl_dijkstra adjacency_list + diff --git a/agents/dary_heap/perf_linux_gcc_inventory.md b/agents/dary_heap/perf_linux_gcc_inventory.md new file mode 100644 index 0000000..a1f788a --- /dev/null +++ b/agents/dary_heap/perf_linux_gcc_inventory.md @@ -0,0 +1,123 @@ +# `artifacts/perf/linux_gcc/` — pre-collected Linux GCC reference + +Captured: 2026-04-28, branch `indexed-dary-heap`, host Titania (WSL2) +Build: `linux-gcc-release` preset (g++ 13.x, `-O3 -DNDEBUG`, +`DIJKSTRA_BENCH_BGL=ON`, BGL at `/home/phil/dev_graph/boost`) + +This directory is **gitignored** (it lives under `artifacts/`), but the +file inventory below is committed so a future session — or a Windows +session diff'ing against `msvc_profile/` — knows what artefacts to +expect. Re-generate on Linux/WSL with: + +```bash +cmake --preset linux-gcc-release \ + -DDIJKSTRA_BENCH_BGL=ON \ + -DBGL_INCLUDE_DIR=/home/phil/dev_graph/boost +cmake --build --preset linux-gcc-release -j --target benchmark_dijkstra + +bash scripts/perf/linux_gcc_capture.sh +``` + +The script drives `scripts/perf/bench_run.py` (wall-clock), `perf stat` +(software events only — WSL has no PMU), and +`scripts/perf/objdump_capture.py` (per-symbol asm via the manifest at +`agents/perf_capture_manifest_linux.txt`). + +## WSL-specific constraints + +- Hardware PMU events (`cache-misses`, `LLC-load-misses`, + `L1-dcache-load-misses`, `cycles` for `perf record`) silently fail + or return zero on WSL2. The capture script attempts a + software-only set (`task-clock,context-switches,page-faults, + cpu-migrations,instructions:u,cycles:u`); on this host even those + return non-zero from `perf stat`, so the `perfstat_*` files exist + but should not be relied on. Wall-clock + objdump are the primary + signals on this side. +- Hardware-counter analysis (cache miss rates, frontend stalls, + branch mispredict ratios) was done on Windows under VTune; results + in `artifacts/perf/msvc_profile/{hotspots,callstacks}.csv`. + +## Inventory + +| File | Lines / size | Description | +|------|-------------:|-------------| +| `wallclock_baseline.json` | 96 rows | bench_run.py JSON, 24 benches × 4 aggregates | +| `diff_msvc_vs_gcc.md` | 26 | Cross-toolchain markdown table from `bench_compare.py` | +| `perfstat_*.{stdout,stderr}` | 8 files | `perf stat` software events (PMU N/A; informational only) | +| **graph-v3 algorithm bodies** | | (sift_down_/sift_up_/comparator are *fully inlined* — no separate symbols) | +| `dijkstra_csr_idx2.asm` | 361 | Outer dijkstra closure, Idx2 (heap inlined) | +| `dijkstra_csr_idx4.asm` | 387 | Outer dijkstra closure, Idx4 (heap inlined) | +| `dijkstra_csr_idx8.asm` | 382 | Outer dijkstra closure, Idx8 (heap inlined) | +| `dijkstra_vov_idx4.asm` | 465 | Outer dijkstra closure, VoV Idx4 (heap inlined) | +| **BGL counterparts** | | | +| `dijkstra_bgl_csr.asm` | 412 | run_bgl_dijkstra (full BGL body inlined) | +| `dijkstra_bgl_adj.asm` | 424 | run_bgl_dijkstra (full BGL body inlined) | + +### Symbols that have no standalone body under GCC + +Under `-O3`, GCC inlines all of these into the dijkstra closure they're +called from. The MSVC build does emit standalone bodies; the inlined +GCC instructions are folded into the dijkstra body counts above. + +| Symbol (MSVC name) | MSVC body | GCC | +|-------------------------------------------------|----------:|-----| +| graph-v3 `sift_down_` (per arity) | 184–186 | inlined | +| graph-v3 `sift_up_` | 109 | inlined | +| graph-v3 `container_value_fn::operator()` | 85 | inlined | +| BGL `preserve_heap_property_down` | 299 | inlined | +| BGL `preserve_heap_property_up` | 204 | inlined | + +The absence is itself a real codegen data point — see +`agents/csr_edge_value_perf_plan.md` Phase 2 Linux GCC. + +## Headline observation + +Treating "fully inlined dijkstra body" as the unit of comparison +(MSVC sum = dijkstra body + sift_down_ + sift_up_; GCC = the single +emitted closure body): + +| | graph-v3 Idx4 | BGL CSR | ratio | +|----------------------------|--------------:|--------:|------:| +| MSVC `/O2 /Ob3` (sum) | 499 | 1,008 | 2.0× | +| GCC `-O3` (closure body) | 387 | 412 | 1.06× | + +graph-v3's closure compresses ~22 % under GCC; BGL's compresses ~59 %. +That delta is consistent with the wall-clock observation that on Linux +GCC graph-v3 CSR Idx4 is **+15 % to +40 % slower than BGL CSR**, while +on MSVC it is 34–64 % *faster*. + +## Gap-status verdict (decision tree from `thread_b_linux_runbook.md`) + +> graph-v3 still +30 %+ slower on Grid (the original 4.3a worst case) + +The Phase 4.3a Linux/GCC gap is **intact at HEAD**: + +| Topology | 2025 4.3a Idx4 vs BGL | 2026-04-28 Idx4 vs BGL | +|-----------|-----------------------|------------------------| +| ER_Sparse | +7.7 % | +14.7 % – +21.9 % | +| Grid | +36.5 % | **+36.2 % – +39.9 %** | +| BA | +9.4 % | +6.0 % – +18.8 % | +| Path | +15.0 % | +15.2 % – +15.6 % | + +Phases 3–5 of `csr_edge_value_perf_plan.md` are un-deferred. + +## Manifest reference + +Capture targets are listed in `agents/perf_capture_manifest_linux.txt` +(format: `basename[:N] length_hex regex [substr ...]`). Differences +from the MSVC manifest: + +- GCC mangling uses `Nul` for `unsigned long` non-type template args + (e.g. `use_indexed_dary_heap<4ul>`), not MSVC's `4u`. +- The graph-v3 dijkstra body lives inside an inner closure + `{lambda(auto:1&)#1}::operator()`; the manifest matches it via the + combination of `use_indexed_dary_heap` + `operator()` + + graph-type substring (`compressed_graph` / `dynamic_graph`). +- BGL's body is captured via `run_bgl_dijkstra` + + `compressed_sparse_row_graph` / `adjacency_list`. The + `dijkstra_shortest_paths_no_color_map_no_init` regex from the MSVC + manifest matches no symbol on GCC (fully inlined). +- `.cold` partitions exist for several of these. `nm --print-size` + reports only the hot partition; the cold partition is at a lower + address (see `BM_Dijkstra_*_Idx4(...) [clone .cold]` symbols) and is + not currently captured. diff --git a/agents/dary_heap/perf_msvc_profile_inventory.md b/agents/dary_heap/perf_msvc_profile_inventory.md new file mode 100644 index 0000000..e7fbf7d --- /dev/null +++ b/agents/dary_heap/perf_msvc_profile_inventory.md @@ -0,0 +1,71 @@ +# `artifacts/perf/msvc_profile/` — pre-collected MSVC reference + +Captured: 2026-04-27, branch `indexed-dary-heap`, host Titania +Build: `windows-msvc-profile` preset (`/O2 /Ob3 /Zi /DNDEBUG`, `/DEBUG` +linker, `DIJKSTRA_BENCH_BGL=ON`, BGL at `D:/dev_graph/boost`) + +This directory is **gitignored** (it lives under `artifacts/`), but the +file inventory below is committed so a Linux/GCC session knows what +artefacts to compare against. Re-generate on Windows with: + +```pwsh +# Wall-clock baseline (2-3 min) +python scripts/perf/bench_run.py ` + --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe ` + --filter "BM_Dijkstra_(CSR|BGL_CSR)_(ER_Sparse|Grid|BA|Path)(_Idx4)?/(10000|100000)$" ` + --reps 5 --min-time 2s --label "windows-msvc-profile" ` + --out artifacts/perf/msvc_profile/wallclock_baseline.json + +# Symbol disasm captures (~1 min after symbol-index cold call) +python scripts/perf/capture_asm.py ` + --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe ` + --manifest agents/perf_capture_manifest.txt ` + --out-dir artifacts/perf/msvc_profile + +# VTune hotspots + callstacks (~30s collect + 5s report) +& $vtune -collect hotspots -knob sampling-mode=sw ` + -result-dir vtune/hot_grid_idx4_profile_001 -- ` + build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe ` + --benchmark_filter="BM_Dijkstra_CSR_Grid_Idx4/100000" --benchmark_min_time=15s +& $vtune -report hotspots -r vtune/hot_grid_idx4_profile_001 -format csv ` + > artifacts/perf/msvc_profile/hotspots.csv +& $vtune -report callstacks -r vtune/hot_grid_idx4_profile_001 -format csv ` + > artifacts/perf/msvc_profile/callstacks.csv +``` + +## Inventory + +| File | Lines / size | Description | +|------|-------------:|-------------| +| `wallclock_baseline.json` | 96 rows | bench_run.py JSON for 24 benchmarks × 4 aggregates | +| `hotspots.csv` | 38 KB | VTune software-mode hotspots, function-level CPU time | +| `callstacks.csv` | 597 KB | VTune callstack tree | +| **graph-v3 heap** | | | +| `sift_down_csr_idx2.asm` | 186 | Idx2 heap sift-down body | +| `sift_down_csr_idx4.asm` | 184 | Idx4 heap sift-down body | +| `sift_down_csr_idx8.asm` | 186 | Idx8 heap sift-down body | +| `sift_down_vov_idx4.asm` | 191 | VoV Idx4 sift-down (control) | +| `sift_up_csr_idx4.asm` | 109 | Idx4 sift-up body | +| **graph-v3 algorithm** | | | +| `dijkstra_csr_idx2.asm` | 206 | Outer Dijkstra body, Idx2 | +| `dijkstra_csr_idx4.asm` | 206 | Outer Dijkstra body, Idx4 | +| `dijkstra_csr_idx8.asm` | 206 | Outer Dijkstra body, Idx8 | +| `container_value_fn.asm` | 85 | edge_value adapter | +| **BGL counterparts** | | | +| `bgl_dary_sift_down_csr.asm` | 299 | preserve_heap_property_down (CSR) | +| `bgl_dary_sift_up_csr.asm` | 204 | preserve_heap_property_up (CSR) | +| `dijkstra_bgl_csr.asm` | 505 | dijkstra_shortest_paths_no_color_map_no_init (CSR) | + +## Headline observation + +Line counts are a proxy for instruction count when comparing functions +compiled at the same `/O2` level on the same toolchain. On MSVC: + +| | graph-v3 | BGL | ratio | +|-----------------------|---------:|----:|------:| +| Dijkstra body | 206 | 505 | 2.5× | +| sift_down_ | 184 | 299 | 1.6× | + +This is consistent with graph-v3 being **34-64 % faster than BGL** on +MSVC profile (Phase 1.1 wall-clock data). The Linux side needs to confirm +or refute the same ratio under GCC. diff --git a/agents/dary_heap/thread_b_linux_runbook.md b/agents/dary_heap/thread_b_linux_runbook.md new file mode 100644 index 0000000..4669eb5 --- /dev/null +++ b/agents/dary_heap/thread_b_linux_runbook.md @@ -0,0 +1,144 @@ +# Thread B — Linux/GCC investigation runbook + +This runbook closes Thread B of `agents/csr_edge_value_perf_plan.md`. + +The MSVC side (Phases 1.1, 2) is complete and committed. The original +Phase 4.3a "graph-v3 is +7 % to +37 % slower than BGL" gap was measured on +**Linux/GCC** and has not been reproduced under MSVC `/O2 /Ob3`. We need +to know whether the gap still exists on Linux/GCC at the current branch +HEAD before deciding whether Phases 3-5 (interventions) should run. + +## Constraint: WSL has no PMU + +WSL2 does not expose the host's hardware performance counters, so: + +- ❌ No `perf stat -e cache-misses,L1-dcache-load-misses,LLC-load-misses` +- ❌ No `perf record -e cycles -F 4000` +- ❌ No Linux equivalent of VTune microarchitecture exploration +- ✅ Wall-clock benchmarks work fine +- ✅ `perf stat` for software events (`task-clock`, `instructions:u`, + `context-switches`, etc.) — covered by the helper script +- ✅ `objdump --demangle` — full disassembly comparison against MSVC + +Everything that *does* need PMU has been pre-collected on the Windows +side and lives in `artifacts/perf/msvc_profile/`. The Linux side compares +against it directly. + +## Pre-collected MSVC reference (`artifacts/perf/msvc_profile/`) + +| File | What it gives the Linux comparison | +|---|---| +| `wallclock_baseline.json` | 96 rows (24 benchmarks × 4 aggregates) on `windows-msvc-profile`. Run the same filter on Linux and use `bench_compare.py` to diff. | +| `hotspots.csv` | VTune software-mode hotspots top-N (function-level CPU time). Linux reproduces this with `perf record --call-graph=fp -F 999` (no PMU needed). | +| `callstacks.csv` | VTune callstack tree. Linux gets the same shape from `perf script` after a software-event `perf record`. | +| `sift_down_csr_idx{2,4,8}.asm` | Per-arity inlined heap-sift body. Each is ~190 lines, 5-insn-per-comparison shape. | +| `sift_down_vov_idx4.asm` | VoV variant for control. | +| `sift_up_csr_idx4.asm` | sift_up_ counterpart (~110 lines). | +| `dijkstra_csr_idx{2,4,8}.asm` | The actual Dijkstra-with-relax-loop body, ~206 lines each. | +| `dijkstra_bgl_csr.asm` | BGL's `dijkstra_shortest_paths_no_color_map_no_init` for `compressed_sparse_row_graph`. **505 lines on MSVC.** Compare line counts and per-edge instruction count vs graph-v3's 206. | +| `bgl_dary_sift_down_csr.asm` | BGL's `preserve_heap_property_down`, ~299 lines. Compare against graph-v3's `sift_down_csr_idx4.asm` (184 lines). | +| `bgl_dary_sift_up_csr.asm` | BGL's `preserve_heap_property_up`, ~204 lines. | +| `container_value_fn.asm` | graph-v3's value-function adapter. | + +### The size signal + +MSVC line counts at `/O2 /Ob3 /Zi`: + +``` +graph-v3 dijkstra body (Idx4) 206 lines +BGL dijkstra body (CSR) 505 lines (~2.5x) + +graph-v3 sift_down_ (Idx4) 184 lines +BGL preserve_heap_property_down 299 lines (~1.6x) +``` + +This is consistent with graph-v3 being 34-64 % faster than BGL on MSVC +(measured in Phase 1.1). The Linux question is whether GCC produces a +similar size ratio (in which case Linux/GCC should also see graph-v3 +ahead) or whether GCC compresses BGL more aggressively (which would +explain the original 4.3a gap). + +## Setup (WSL) + +```bash +# 1. Configure & build the Linux release preset +cmake --preset linux-gcc-release \ + -DDIJKSTRA_BENCH_BGL=ON \ + -DBGL_INCLUDE_DIR=/path/to/boost +cmake --build --preset linux-gcc-release -j + +# 2. Verify the benchmark binary exists +file build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra +``` + +If `linux-gcc-release` doesn't exist as a preset on this branch, either +add one mirroring `windows-msvc-release` or build manually: + +```bash +mkdir -p build/linux-gcc-release && cd build/linux-gcc-release +cmake -G Ninja -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_BENCHMARKS=ON \ + -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/path/to/boost \ + ../.. +ninja benchmark_dijkstra +``` + +## Capture + +```bash +# Drives steps 1, 2, 3 below in order. +bash scripts/perf/linux_gcc_capture.sh +``` + +The script does: + +1. **Wall-clock baseline** (`bench_run.py`, 5 reps median, taskset core 4). + Output: `artifacts/perf/linux_gcc/wallclock_baseline.json`. +2. **`perf stat` software events** for the 4 canonical 100K benchmarks. + Output: `artifacts/perf/linux_gcc/perfstat_*.{stdout,stderr}`. +3. **`objdump` per-symbol captures** mirroring the MSVC manifest. + Output: `artifacts/perf/linux_gcc/*.asm`. + +## Compare + +```bash +# Toolchain wall-clock comparison +python3 scripts/perf/bench_compare.py \ + --baseline artifacts/perf/msvc_profile/wallclock_baseline.json \ + --candidate artifacts/perf/linux_gcc/wallclock_baseline.json \ + --label-baseline msvc --label-candidate gcc \ + --threshold 5 + +# Per-symbol size diff (one-liner) +for f in artifacts/perf/msvc_profile/*.asm; do + base=$(basename "$f" .asm) + ml=$(wc -l < "$f") + gl=$(wc -l < "artifacts/perf/linux_gcc/$base.asm" 2>/dev/null || echo NA) + printf "%-30s msvc=%4s gcc=%4s\n" "$base" "$ml" "$gl" +done +``` + +## Decision tree + +| Outcome | Verdict | +|---------|---------| +| graph-v3 wins on Linux too (≤ 0 % delta vs BGL) | Plan **closed**. Update `csr_edge_value_perf_plan.md` Phase 5. The Phase 4.3a gap was closed by the post-4.3a commits (5085c60, 7645a19, 1c871a8, aa95fe0). | +| graph-v3 still slower than BGL on Linux (+5 % or more) | Original investigation **resumes**: run Phase 1.2 (perf-stat counters — software-only on WSL), Phase 1.3 (perf record), Phase 2 (objdump diff between graph-v3 and BGL). The pre-collected MSVC asm gives the codegen reference for what "tight" looks like on the comparable workload. | +| Linux numbers are noisy (CV > 10 %) | Re-run with `--reps 9` and a quieter system. WSL on a busy Windows host is the most likely cause; pin to a single core (`taskset -c 0`) and disable Windows background services. | + +## Files added by this work + +| Path | Purpose | +|------|---------| +| `scripts/perf/sym_index.py` | Cached dumpbin symbol index (Windows) | +| `scripts/perf/disasm_func.py` | Single-function disasm (Windows) | +| `scripts/perf/find_func.py` | Symbol search (Windows) | +| `scripts/perf/capture_asm.py` | Bulk dumpbin capture (Windows) | +| `scripts/perf/objdump_capture.py` | Bulk objdump capture (Linux) | +| `scripts/perf/linux_gcc_capture.sh` | Linux runbook driver | +| `scripts/perf/bench_run.py` | Cross-platform Google Benchmark wrapper | +| `scripts/perf/bench_compare.py` | JSON-diff into markdown | +| `scripts/perf/vtune_top.py` | VTune CSV parser | +| `agents/perf_capture_manifest.txt` | MSVC capture targets | +| `agents/perf_capture_manifest_linux.txt` | GCC capture targets | +| `artifacts/perf/msvc_profile/` | Pre-collected MSVC reference (gitignored) | diff --git a/agents/indexed_dary_heap_plan.md b/agents/indexed_dary_heap_plan.md deleted file mode 100644 index 53a64d3..0000000 --- a/agents/indexed_dary_heap_plan.md +++ /dev/null @@ -1,319 +0,0 @@ -# Indexed d-ary Heap for Dijkstra & Prim — Plan - -This plan introduces a true decrease-key priority queue to replace the -`std::priority_queue` lazy-deletion pattern currently used by Dijkstra -(and likely useful for Prim's MST). The goal is to remove stale-pop -overhead, reduce heap memory from O(E) to O(V), and bring visitor -semantics in line with BGL. - -**Branch:** `indexed-dary-heap` - -**Invariant:** After every phase, `ctest` passes all existing tests. No -phase may break the public API of `dijkstra_shortest_paths`, -`dijkstra_shortest_distances`, or any algorithm that already uses -`std::priority_queue` internally. - ---- - -## Conventions - -| Symbol | Meaning | -|--------|---------| -| **File** | Absolute path relative to repo root | -| **Read** | Files the agent must read for context before editing | -| **Create** | New files to create | -| **Modify** | Existing files to edit | -| **Verify** | Commands to run and expected outcomes | -| **Commit** | Git commit message (conventional-commit style) | - ---- - -## Background - -### Current state - -`dijkstra_shortest_paths` uses `std::priority_queue` -with re-insertion when a vertex's distance improves. The recently -added stale-pop skip: - -```cpp -if (compare(distance(g, uid), w)) continue; -``` - -makes this correct and gives single-shot visitor semantics, but the -heap can hold up to O(E) entries and every relaxed edge causes a -push. - -### Target state - -A min-heap that: - -- Stores at most one entry per vertex (size ≤ V). -- Supports `push`, `top`, `pop`, `decrease(vid)`, `contains(vid)`. -- Looks up a vertex's current distance via the user-supplied - `DistanceFn` (so heap order tracks live distance). -- Is parameterized on arity `d` (default `d = 4`, matching Boost's - `d_ary_heap_indirect`). -- Uses an external position map (`vertex_id -> heap_index`) so that - `decrease` is O(log_d V). - -### Performance hypothesis - -| Workload | Expected change vs. current | -|----------|-----------------------------| -| Sparse graph, few re-relaxations | Small win (push count drops, log V vs log E) | -| Dense graph, many re-relaxations | Large win (heap size O(V) vs O(E)) | -| Mapped (associative) vertex containers | Win depends on position-map cost | - -Hypothesis must be confirmed by benchmarks (Phase 4) before declaring -the new heap the default. - ---- - -## Phase 0 — Preparation (no code changes) - -### 0.1 Verify Baseline - -| Item | Detail | -|------|--------| -| **Action** | Confirm the full test suite is green on the branch base. | -| **Verify** | `cd build/linux-gcc-debug && ctest --output-on-failure` — all tests pass | - -### 0.2 Capture Baseline Benchmarks - -| Item | Detail | -|------|--------| -| **Action** | Record current Dijkstra benchmark numbers. | -| **Read** | `benchmark/algorithms/` for existing Dijkstra benchmarks | -| **Verify** | Save numbers to `agents/indexed_dary_heap_baseline.md` (gitignored or committed as reference). If no Dijkstra benchmark exists, create one in 0.3. | - -### 0.3 Add Dijkstra Benchmark (if missing) - -| Item | Detail | -|------|--------| -| **Action** | Ensure a Google Benchmark target exercises Dijkstra over (a) sparse random graph, (b) dense random graph, (c) grid graph, each at multiple V sizes. | -| **Create** | `benchmark/algorithms/benchmark_dijkstra.cpp` if not present | -| **Verify** | Benchmark builds and produces stable numbers across runs (CV < 5%). | - ---- - -## Phase 1 — Indexed d-ary Heap Container - -### 1.1 Design Header - -| Item | Detail | -|------|--------| -| **Read** | `boost/libs/graph/include/boost/graph/detail/d_ary_heap.hpp` for reference | -| **Create** | `include/graph/detail/indexed_dary_heap.hpp` | - -Sketch of the public interface: - -```cpp -namespace graph::detail { - -// External-key, indirect-comparison d-ary heap. -// -// Key : the user's vertex id type (must be usable as an index/lookup key) -// DistanceFn: callable (key) -> Distance& (or const Distance&) -// Compare : strict weak order over Distance values (min-heap if less<>) -// PositionMap: random-access mapping key -> size_t (heap position) or NPOS -// Arity : children per node (default 4) -template < - class Key, - class DistanceFn, - class Compare, - class PositionMap, - std::size_t Arity = 4, - class Allocator = std::allocator> -class indexed_dary_heap { -public: - static constexpr std::size_t npos = static_cast(-1); - - indexed_dary_heap(DistanceFn d, Compare c, PositionMap p, const Allocator& = {}); - - bool empty() const noexcept; - size_t size() const noexcept; - - void push(Key k); // O(log_d N) - Key top() const; // O(1) - void pop(); // O(d log_d N) - void decrease(Key k); // O(log_d N) — distance must already be lower - bool contains(Key k) const; // O(1) - void clear(); - -private: - std::vector heap_; - DistanceFn distance_; - Compare compare_; - PositionMap position_; // heap stores positions back into here on every move - - void sift_up_(size_t i); - void sift_down_(size_t i); - void place_(size_t i, Key k); // writes heap_[i] = k AND position_[k] = i -}; - -} // namespace graph::detail -``` - -Notes: -- `PositionMap` is a *concept-style* requirement: `size_t& operator()(Key)` or - similar. For index-based graphs, it can wrap a `std::vector`. For - mapped graphs, it can wrap an `std::unordered_map`. Decision deferred to - 1.3. -- `DistanceFn` is the *same* function the user passes to - `dijkstra_shortest_paths`. The heap reads, never writes. -- Comparator is `Compare`, applied to *distances* (not keys). Internally: - `compare_(distance_(a), distance_(b))`. - -### 1.2 Implement Core Operations - -| Item | Detail | -|------|--------| -| **Action** | Implement `push`, `pop`, `sift_up_`, `sift_down_`, `decrease`, `contains`, `clear`. Keep `place_` as the single point where positions are written, to avoid bookkeeping bugs. | -| **Verify** | Unit-tests in 1.4 pass. | - -Key correctness rules: -- Every assignment to `heap_[i]` must go through `place_` so `position_` stays in sync. -- `decrease(k)` reads `position_(k)` then sifts up only — caller guarantees the new distance is no worse. -- `pop()` swaps last → root, marks the popped key's position as `npos`, then sifts down. - -### 1.3 Position Map Adapter - -| Item | Detail | -|------|--------| -| **Create** | `include/graph/detail/heap_position_map.hpp` | -| **Action** | Provide two adapters:
1. `vector_position_map` — wraps a `std::vector` indexed by integral key.
2. `assoc_position_map` — wraps `std::unordered_map` for non-integral keys.
Both default-construct to `npos` semantics. | -| **Verify** | Adapters compile with the heap. Covered by tests in 1.4. | - -### 1.4 Unit Tests - -| Item | Detail | -|------|--------| -| **Create** | `tests/common/test_indexed_dary_heap.cpp` | -| **Action** | Cover: empty heap, single element, ascending/descending pushes, mixed push+pop, repeated `decrease`, `contains` before/after push/pop, both arity 2 and 4, custom comparator (max-heap), both position-map adapters. | -| **Verify** | `ctest -R indexed_dary_heap` — all pass. | -| **Commit** | `feat(detail): indexed d-ary heap with external position map` | - ---- - -## Phase 2 — Integrate into Dijkstra (opt-in) - -### 2.1 Add Heap-Selector Tag (or Template Parameter) - -| Item | Detail | -|------|--------| -| **Read** | `include/graph/algorithm/dijkstra_shortest_paths.hpp` | -| **Modify** | Add an optional template parameter `Heap = use_default_heap` (a tag). When `use_default_heap`, behavior is unchanged. When `use_indexed_dary_heap`, the new heap is used. | -| **Verify** | Existing tests still pass (default path unchanged). | -| **Commit** | `feat(dijkstra): add heap-selector template parameter` | - -Rationale: keeps the change additive and reversible. We can flip the -default in a later phase once benchmarks confirm parity or improvement. - -### 2.2 Implementation Branch - -| Item | Detail | -|------|--------| -| **Modify** | Inside `dijkstra_shortest_paths`, dispatch to one of two inner implementations based on the `Heap` tag. Share the visitor / relax / source-seeding code via a small helper. | -| **Action** | The indexed-heap implementation:
- Removes the stale-pop skip (no stale entries possible).
- Replaces re-push with `decrease` on the relax path.
- Removes `weighted_vertex` (heap stores ids only; distance is read live via `DistanceFn`). | -| **Verify** | All existing Dijkstra tests pass under both code paths. Add a test variant that exercises each test with the indexed heap. | -| **Commit** | `feat(dijkstra): indexed d-ary heap implementation path` | - -### 2.3 Visitor Semantics Audit - -| Item | Detail | -|------|--------| -| **Action** | Confirm `on_examine_vertex` and `on_finish_vertex` fire exactly once per reachable vertex on the indexed-heap path. Confirm `on_edge_relaxed` and `on_edge_not_relaxed` counts match Boost's behavior. | -| **Verify** | Add a counting-visitor test that asserts call counts on a reference graph with both heap paths. | -| **Commit** | `test(dijkstra): visitor call-count parity across heap paths` | - ---- - -## Phase 3 — Mapped-Container Support - -### 3.1 Position Map for Mapped Graphs - -| Item | Detail | -|------|--------| -| **Read** | `agents/map_container_strategy.md`, `agents/map_container_plan.md` | -| **Action** | Wire the `assoc_position_map` adapter into the indexed-heap dispatch when `vertex_id_t` is non-integral or the graph is a mapped container. Decision criterion to be documented. | -| **Verify** | Run the Dijkstra test suite against mapped graph types with the indexed heap. | -| **Commit** | `feat(dijkstra): indexed-heap support for mapped containers` | - -### 3.2 Vertex-Property-Map Position Storage (optional) - -| Item | Detail | -|------|--------| -| **Action** | Investigate whether the position map can live inside the graph as a vertex property map (matching Boost's `vertex_property_map_generator`). Spike only — implement only if it removes a meaningful allocation on hot paths. | -| **Verify** | Benchmark before/after on mapped graphs. | -| **Commit** | `feat(dijkstra): in-graph position map for mapped containers` (only if accepted) | - ---- - -## Phase 4 — Benchmarks & Default Selection - -### 4.1 Comparative Benchmarks - -| Item | Detail | -|------|--------| -| **Action** | Run the Phase 0.3 benchmarks against (a) `priority_queue` path, (b) `indexed_dary_heap<2>`, (c) `indexed_dary_heap<4>`, (d) `indexed_dary_heap<8>`. Record results in `agents/indexed_dary_heap_results.md`. | -| **Verify** | Numbers stable across at least 3 runs. | - -### 4.2 Decide Default - -| Item | Detail | -|------|--------| -| **Action** | Based on results:
- If indexed `d=4` wins or ties on every workload, make it the default.
- If it loses on sparse small graphs, keep `priority_queue` default and document the selector.
- If results are mixed, consider a heuristic dispatch (e.g., based on E/V ratio) — but only with strong evidence. | -| **Modify** | Default heap parameter, plus a CHANGELOG entry. | -| **Verify** | Full test suite still green. Benchmarks regenerated. | -| **Commit** | `perf(dijkstra): switch default heap to indexed d-ary` (or document why not) | - ---- - -## Phase 5 — Reuse for Prim's MST (optional follow-up) - -### 5.1 Audit Prim's Implementation - -| Item | Detail | -|------|--------| -| **Read** | `include/graph/algorithm/mst.hpp` (or wherever Prim lives) | -| **Action** | Identify whether Prim has the same lazy-deletion pattern. If yes, plan a parallel migration. | -| **Verify** | N/A (planning only). | - -### 5.2 Apply Indexed Heap to Prim - -| Item | Detail | -|------|--------| -| **Action** | Mirror Phase 2 for Prim: opt-in selector → integrate → benchmark → switch default. | -| **Verify** | MST test suite green. | -| **Commit** | `perf(mst): indexed d-ary heap path for Prim` | - ---- - -## Open Questions - -1. **PositionMap ownership.** Owned by the heap (simplest, allocates per call), or - passed in (zero-allocation for repeated calls, more API surface)? Default to - owned-by-heap for the first cut. -2. **Arity as runtime vs compile-time.** Compile-time only — runtime would lose - the constexpr unrolling that justifies d-ary heaps in the first place. -3. **`Compare` indirection cost.** The heap calls `compare_(distance_(a), distance_(b))` - twice per sift-down step (one comparator call per child + one against the parent). - For trivial `DistanceFn` (vector lookup) this should inline; verify in benchmarks. -4. **Visitor `on_examine_vertex` semantics on multi-source seeding.** The current - multi-source code seeds N vertices into the queue. With the indexed heap, the - first pop of each source is the settled pop (no re-pushes possible since - distance is already 0). Confirm visitor semantics are unchanged. -5. **Should the new heap live in `graph/detail/` or be promoted to `graph/container/`?** - Defer the decision — start in `detail/` and promote only if external code finds it - useful. - ---- - -## Out of Scope - -- Fibonacci heap, pairing heap, or radix heap implementations. -- Replacing other algorithms' priority queues (BFS variants, A*, etc.). -- Changing public algorithm signatures beyond adding the optional `Heap` template - parameter. -- Parallel / concurrent heap variants. diff --git a/benchmark/algorithms/CMakeLists.txt b/benchmark/algorithms/CMakeLists.txt index 0ed3c32..8883b0f 100644 --- a/benchmark/algorithms/CMakeLists.txt +++ b/benchmark/algorithms/CMakeLists.txt @@ -1,20 +1,111 @@ # Algorithm Benchmarks CMakeLists.txt # Performance benchmarks for graph algorithms -# Benchmark executables will be added here as algorithms are implemented -# Example: +# --------------------------------------------------------------------------- +# Dijkstra benchmark (Phase 0 — baseline capture) +# --------------------------------------------------------------------------- + +add_executable(benchmark_dijkstra + benchmark_dijkstra.cpp +) + +target_link_libraries(benchmark_dijkstra + PRIVATE + graph::graph3 + benchmark::benchmark +) + +target_include_directories(benchmark_dijkstra + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} +) + +# Optional large-scale tier (V = 1 000 000): cmake -DDIJKSTRA_BENCH_LARGE=ON +option(DIJKSTRA_BENCH_LARGE "Enable V=1000000 benchmark tier" OFF) +if(DIJKSTRA_BENCH_LARGE) + target_compile_definitions(benchmark_dijkstra PRIVATE DIJKSTRA_BENCH_LARGE) +endif() + +# Optional Boost.Graph (BGL) comparison benchmarks (Phase 4.3). # -# add_executable(benchmark_shortest_path -# benchmark_shortest_path.cpp -# ) +# BGL is header-only, so we only need an include directory containing +# . No linking required. # -# target_link_libraries(benchmark_shortest_path -# PRIVATE -# graph::graph3 -# benchmark::benchmark -# ) +# Resolution order for the include directory (first match wins): +# 1. Explicit cache var: cmake -DBGL_INCLUDE_DIR=/path/to/boost +# 2. Environment variable: $env:BGL_INCLUDE_DIR or $BGL_INCLUDE_DIR +# 3. Environment variable: $env:BOOST_ROOT or $BOOST_ROOT +# 4. Per-platform defaults (see DIJKSTRA_BENCH_BGL_DEFAULT_PATHS below) # -# add_test(NAME benchmark_shortest_path -# COMMAND benchmark_shortest_path --benchmark_min_time=0.1) +# Typical invocations: +# cmake -DDIJKSTRA_BENCH_BGL=ON # auto-discover +# cmake -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=D:/dev_graph/boost +# $env:BGL_INCLUDE_DIR = "D:/dev_graph/boost"; cmake -DDIJKSTRA_BENCH_BGL=ON +option(DIJKSTRA_BENCH_BGL "Enable BGL comparison benchmarks (auto-discovers Boost; override with BGL_INCLUDE_DIR)" OFF) +set(BGL_INCLUDE_DIR "" CACHE PATH "Boost include directory for BGL benchmarks (overrides auto-discovery)") + +# Per-platform default search paths. Customise for new environments by either +# setting BGL_INCLUDE_DIR / BOOST_ROOT in the environment or by appending here. +if(WIN32) + set(DIJKSTRA_BENCH_BGL_DEFAULT_PATHS + "D:/dev_graph/boost" + "C:/dev_graph/boost" + "C:/boost" + "C:/local/boost" + ) +elseif(APPLE) + set(DIJKSTRA_BENCH_BGL_DEFAULT_PATHS + "$ENV{HOME}/dev_graph/boost" + "/opt/homebrew/include" + "/usr/local/include" + ) +else() # Linux / other Unix + set(DIJKSTRA_BENCH_BGL_DEFAULT_PATHS + "$ENV{HOME}/dev_graph/boost" + "/usr/include" + "/usr/local/include" + ) +endif() + +if(DIJKSTRA_BENCH_BGL) + set(_bgl_resolved "") + set(_bgl_marker "boost/graph/dijkstra_shortest_paths.hpp") + + # Build candidate list in priority order. + set(_bgl_candidates "") + if(BGL_INCLUDE_DIR) + list(APPEND _bgl_candidates "${BGL_INCLUDE_DIR}") + endif() + if(DEFINED ENV{BGL_INCLUDE_DIR} AND NOT "$ENV{BGL_INCLUDE_DIR}" STREQUAL "") + list(APPEND _bgl_candidates "$ENV{BGL_INCLUDE_DIR}") + endif() + if(DEFINED ENV{BOOST_ROOT} AND NOT "$ENV{BOOST_ROOT}" STREQUAL "") + list(APPEND _bgl_candidates "$ENV{BOOST_ROOT}") + endif() + list(APPEND _bgl_candidates ${DIJKSTRA_BENCH_BGL_DEFAULT_PATHS}) + + foreach(_cand IN LISTS _bgl_candidates) + if(_cand AND EXISTS "${_cand}/${_bgl_marker}") + set(_bgl_resolved "${_cand}") + break() + endif() + endforeach() + + if(NOT _bgl_resolved) + message(FATAL_ERROR + "DIJKSTRA_BENCH_BGL=ON but no Boost include directory was found containing " + "${_bgl_marker}.\n" + "Tried (in order): ${_bgl_candidates}\n" + "Set -DBGL_INCLUDE_DIR=/path/to/boost, $env:BGL_INCLUDE_DIR, or $env:BOOST_ROOT, " + "or append to DIJKSTRA_BENCH_BGL_DEFAULT_PATHS in benchmark/algorithms/CMakeLists.txt.") + endif() + + target_include_directories(benchmark_dijkstra SYSTEM PRIVATE "${_bgl_resolved}") + target_compile_definitions(benchmark_dijkstra PRIVATE BENCH_BGL) + message(STATUS "BGL comparison benchmarks enabled (using ${_bgl_resolved})") +endif() -# Note: Uncomment and add benchmark executables as algorithms are implemented +# Register with CTest using a short minimum time so CI stays fast. +# For proper baseline capture use: ./benchmark_dijkstra --benchmark_min_time=1.0 +add_test(NAME benchmark_dijkstra + COMMAND benchmark_dijkstra --benchmark_min_time=0.1) diff --git a/benchmark/algorithms/benchmark_dijkstra.cpp b/benchmark/algorithms/benchmark_dijkstra.cpp new file mode 100644 index 0000000..c44a7f5 --- /dev/null +++ b/benchmark/algorithms/benchmark_dijkstra.cpp @@ -0,0 +1,346 @@ +/** + * @file benchmark_dijkstra.cpp + * @brief Google Benchmark suite for dijkstra_shortest_distances. + * + * Covers four graph topologies × two containers (CSR and vov) across a + * scale sweep of V ∈ {1 000, 10 000, 100 000}. Graph construction and + * distance-vector initialisation are excluded from the timed region via + * state.PauseTiming() / state.ResumeTiming() — only the Dijkstra call + * itself is measured. + * + * Benchmark naming convention: + * BM_Dijkstra__ — default heap (priority_queue) + * BM_Dijkstra___Idx — indexed d-ary heap, arity D + * Container : CSR (compressed_graph) + * VoV (dynamic_graph / vov) + * Topology : ER_Sparse Erdős–Rényi, E/V ≈ 8 + * Grid 2D grid (bidirectional, E/V ≈ 4) + * BA Barabási–Albert, m=4, E/V ≈ 8 + * Path Path graph, E/V = 1 (minimum decrease-key) + * + * Compile-time macro DIJKSTRA_BENCH_LARGE enables the 1 000 000-vertex + * tier (disabled by default to keep CI times reasonable). + * + * Phase 0.4 baseline results: agents/indexed_dary_heap_baseline.md + * Phase 4.1 comparative results: agents/indexed_dary_heap_results.md + */ + +#include + +#include +#include +#include + +#include "dijkstra_fixtures.hpp" + +#ifdef BENCH_BGL +# include "bgl_dijkstra_fixtures.hpp" +#endif + +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +namespace { + +/// Resize and initialise the distance vector to +∞. Called once before +/// the benchmark loop; the per-iteration reset uses std::fill (cheaper +/// than reallocation) inside the paused region. +template +void init_dist(const G& g, std::vector& dist) { + const std::size_t n = graph::num_vertices(g); + dist.assign(n, std::numeric_limits::max()); +} + +/// Weight function: return the edge value stored in the graph. +constexpr auto weight_fn = [](const auto& g, const auto& uv) { + return graph::edge_value(g, uv); +}; + +} // namespace + +// --------------------------------------------------------------------------- +// Macro: define a Dijkstra benchmark for a given container, graph builder, +// and heap tag. +// +// Parameters: +// NAME — benchmark function name (e.g. BM_Dijkstra_CSR_ER_Sparse) +// GRAPH_T — graph container type +// MAKE_FN — graph::benchmark::make_csr or make_vov +// EDGE_EXPR — expression producing edge_list, may use vertex_id_t n +// N_EXPR — expression for num_vertices to pass to MAKE_FN (usually n) +// HEAP_TAG — use_default_heap{} or use_indexed_dary_heap{} +// --------------------------------------------------------------------------- + +#define DEFINE_DIJKSTRA_BM(NAME, GRAPH_T, MAKE_FN, EDGE_EXPR, N_EXPR, HEAP_TAG) \ + static void NAME(benchmark::State& state) { \ + const auto n = static_cast(state.range(0)); \ + /* Build graph outside the timed loop */ \ + const auto edges = (EDGE_EXPR); \ + GRAPH_T g = graph::benchmark::MAKE_FN(edges, (N_EXPR)); \ + std::vector dist; \ + init_dist(g, dist); \ + for (auto _ : state) { \ + /* Exclude distance-reset from the measurement */ \ + state.PauseTiming(); \ + std::fill(dist.begin(), dist.end(), std::numeric_limits::max()); \ + state.ResumeTiming(); \ + graph::dijkstra_shortest_distances( \ + g, graph::benchmark::vertex_id_t{0}, graph::container_value_fn(dist), \ + weight_fn, graph::empty_visitor{}, \ + std::less{}, std::plus{}, \ + HEAP_TAG, std::allocator{}); \ + benchmark::DoNotOptimize(dist.data()); \ + } \ + state.SetComplexityN(state.range(0)); \ + } + +// Convenience shorthands for the four heap variants. +#define DEF_BM_DEFAULT(NAME, GT, MK, EE, NE) DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_default_heap{}) +#define DEF_BM_IDX2(NAME, GT, MK, EE, NE) DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_indexed_dary_heap<2>{}) +#define DEF_BM_IDX4(NAME, GT, MK, EE, NE) DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_indexed_dary_heap<4>{}) +#define DEF_BM_IDX8(NAME, GT, MK, EE, NE) DEFINE_DIJKSTRA_BM(NAME, GT, MK, EE, NE, graph::use_indexed_dary_heap<8>{}) + +// --------------------------------------------------------------------------- +// Erdős–Rényi, E/V ≈ 8 (p = 8/n) +// --------------------------------------------------------------------------- + +#define ER_EDGES(n) graph::benchmark::erdos_renyi(n, 8.0 / n) +#define GRID_SQRT(n) static_cast(std::sqrt(static_cast(n))) + +DEF_BM_DEFAULT(BM_Dijkstra_CSR_ER_Sparse, graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n) +DEF_BM_IDX2 (BM_Dijkstra_CSR_ER_Sparse_Idx2, graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n) +DEF_BM_IDX4 (BM_Dijkstra_CSR_ER_Sparse_Idx4, graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n) +DEF_BM_IDX8 (BM_Dijkstra_CSR_ER_Sparse_Idx8, graph::benchmark::csr_graph_t, make_csr, ER_EDGES(n), n) + +DEF_BM_DEFAULT(BM_Dijkstra_VoV_ER_Sparse, graph::benchmark::vov_graph_t, make_vov, ER_EDGES(n), n) +DEF_BM_IDX4 (BM_Dijkstra_VoV_ER_Sparse_Idx4, graph::benchmark::vov_graph_t, make_vov, ER_EDGES(n), n) + +BENCHMARK(BM_Dijkstra_CSR_ER_Sparse) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_ER_Sparse) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_ER_Sparse_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); + +// --------------------------------------------------------------------------- +// 2D grid (rows = cols = sqrt(n), E/V ≈ 4) +// --------------------------------------------------------------------------- + +DEF_BM_DEFAULT(BM_Dijkstra_CSR_Grid, graph::benchmark::csr_graph_t, make_csr, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n)) +DEF_BM_IDX2 (BM_Dijkstra_CSR_Grid_Idx2, graph::benchmark::csr_graph_t, make_csr, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n)) +DEF_BM_IDX4 (BM_Dijkstra_CSR_Grid_Idx4, graph::benchmark::csr_graph_t, make_csr, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n)) +DEF_BM_IDX8 (BM_Dijkstra_CSR_Grid_Idx8, graph::benchmark::csr_graph_t, make_csr, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n)) + +DEF_BM_DEFAULT(BM_Dijkstra_VoV_Grid, graph::benchmark::vov_graph_t, make_vov, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n)) +DEF_BM_IDX4 (BM_Dijkstra_VoV_Grid_Idx4, graph::benchmark::vov_graph_t, make_vov, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), GRID_SQRT(n) * GRID_SQRT(n)) + +BENCHMARK(BM_Dijkstra_CSR_Grid) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_Grid_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_Grid_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_Grid_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_Grid) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_Grid_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); + +// --------------------------------------------------------------------------- +// Barabási–Albert, m=4 (E/V ≈ 8, heavy hub traffic) +// --------------------------------------------------------------------------- + +DEF_BM_DEFAULT(BM_Dijkstra_CSR_BA, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n) +DEF_BM_IDX2 (BM_Dijkstra_CSR_BA_Idx2, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n) +DEF_BM_IDX4 (BM_Dijkstra_CSR_BA_Idx4, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n) +DEF_BM_IDX8 (BM_Dijkstra_CSR_BA_Idx8, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::barabasi_albert(n, 4), n) + +DEF_BM_DEFAULT(BM_Dijkstra_VoV_BA, graph::benchmark::vov_graph_t, make_vov, graph::benchmark::barabasi_albert(n, 4), n) +DEF_BM_IDX4 (BM_Dijkstra_VoV_BA_Idx4, graph::benchmark::vov_graph_t, make_vov, graph::benchmark::barabasi_albert(n, 4), n) + +BENCHMARK(BM_Dijkstra_CSR_BA) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_BA_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_BA_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_BA_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_BA) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_BA_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); + +// --------------------------------------------------------------------------- +// Path graph (E/V = 1, minimum decrease-key) +// --------------------------------------------------------------------------- + +DEF_BM_DEFAULT(BM_Dijkstra_CSR_Path, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n) +DEF_BM_IDX2 (BM_Dijkstra_CSR_Path_Idx2, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n) +DEF_BM_IDX4 (BM_Dijkstra_CSR_Path_Idx4, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n) +DEF_BM_IDX8 (BM_Dijkstra_CSR_Path_Idx8, graph::benchmark::csr_graph_t, make_csr, graph::benchmark::path_graph(n), n) + +DEF_BM_DEFAULT(BM_Dijkstra_VoV_Path, graph::benchmark::vov_graph_t, make_vov, graph::benchmark::path_graph(n), n) +DEF_BM_IDX4 (BM_Dijkstra_VoV_Path_Idx4, graph::benchmark::vov_graph_t, make_vov, graph::benchmark::path_graph(n), n) + +BENCHMARK(BM_Dijkstra_CSR_Path) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_Path_Idx2)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_Path_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_Path_Idx8)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_Path) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_Path_Idx4)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); + +// --------------------------------------------------------------------------- +// Optional large-scale tier (V = 1 000 000) +// Enable with: cmake -DDIJKSTRA_BENCH_LARGE=ON ... +// --------------------------------------------------------------------------- + +#ifdef DIJKSTRA_BENCH_LARGE +BENCHMARK(BM_Dijkstra_CSR_ER_Sparse) ->Arg(1'000'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_ER_Sparse_Idx4)->Arg(1'000'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_ER_Sparse) ->Arg(1'000'000)->Complexity(); +BENCHMARK(BM_Dijkstra_VoV_ER_Sparse_Idx4)->Arg(1'000'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_BA) ->Arg(1'000'000)->Complexity(); +BENCHMARK(BM_Dijkstra_CSR_BA_Idx4)->Arg(1'000'000)->Complexity(); +#endif + +// --------------------------------------------------------------------------- +// BGL comparison benchmarks (Phase 4.3) +// +// Enabled with -DDIJKSTRA_BENCH_BGL=ON -DBGL_INCLUDE_DIR=/path/to/boost. +// Topologically identical graphs are built from the same edge_list, so the +// BGL and graph-v3 numbers can be compared directly. +// +// BGL uses dijkstra_shortest_paths_no_color_map_no_init for fairness: +// caller pre-initialises distances; no color-map allocation inside the +// timed region (matches graph-v3's no-init semantics). +// --------------------------------------------------------------------------- + +#ifdef BENCH_BGL + +#define DEFINE_BGL_DIJKSTRA_BM(NAME, GRAPH_T, MAKE_FN, EDGE_EXPR, N_EXPR) \ + static void NAME(benchmark::State& state) { \ + const auto n = static_cast(state.range(0)); \ + const auto edges = (EDGE_EXPR); \ + GRAPH_T g = graph::benchmark::MAKE_FN(edges, (N_EXPR)); \ + std::vector dist(boost::num_vertices(g), \ + std::numeric_limits::max()); \ + for (auto _ : state) { \ + state.PauseTiming(); \ + std::fill(dist.begin(), dist.end(), std::numeric_limits::max()); \ + dist[0] = 0.0; \ + state.ResumeTiming(); \ + graph::benchmark::run_bgl_dijkstra(g, 0u, dist); \ + benchmark::DoNotOptimize(dist.data()); \ + } \ + state.SetComplexityN(state.range(0)); \ + } + +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_ER_Sparse, + graph::benchmark::bgl_csr_graph_t, make_bgl_csr, + ER_EDGES(n), n) +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_Grid, + graph::benchmark::bgl_csr_graph_t, make_bgl_csr, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), + GRID_SQRT(n) * GRID_SQRT(n)) +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_BA, + graph::benchmark::bgl_csr_graph_t, make_bgl_csr, + graph::benchmark::barabasi_albert(n, 4), n) +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_CSR_Path, + graph::benchmark::bgl_csr_graph_t, make_bgl_csr, + graph::benchmark::path_graph(n), n) + +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_ER_Sparse, + graph::benchmark::bgl_adj_graph_t, make_bgl_adj, + ER_EDGES(n), n) +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_Grid, + graph::benchmark::bgl_adj_graph_t, make_bgl_adj, + graph::benchmark::grid_2d(GRID_SQRT(n), GRID_SQRT(n)), + GRID_SQRT(n) * GRID_SQRT(n)) +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_BA, + graph::benchmark::bgl_adj_graph_t, make_bgl_adj, + graph::benchmark::barabasi_albert(n, 4), n) +DEFINE_BGL_DIJKSTRA_BM(BM_Dijkstra_BGL_Adj_Path, + graph::benchmark::bgl_adj_graph_t, make_bgl_adj, + graph::benchmark::path_graph(n), n) + +BENCHMARK(BM_Dijkstra_BGL_CSR_ER_Sparse)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_CSR_Grid) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_CSR_BA) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_CSR_Path) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_Adj_ER_Sparse)->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_Adj_Grid) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_Adj_BA) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); +BENCHMARK(BM_Dijkstra_BGL_Adj_Path) ->RangeMultiplier(10)->Range(1'000, 100'000)->Complexity(); + +// --------------------------------------------------------------------------- +// Cross-library distance correctness check. +// +// Runs once at startup before benchmarks begin. For each topology, builds +// the same edge list with both libraries, runs Dijkstra from vertex 0, +// and asserts the resulting distance vectors match. Differences indicate +// a wrapper or property-map bug rather than a real performance signal. +// --------------------------------------------------------------------------- + +namespace { + +bool check_bgl_distance_parity() { + using namespace graph::benchmark; + constexpr vertex_id_t n = 1024; + + auto check = [&](const auto& edges) { + const std::size_t nv = static_cast(n); + + // graph-v3 + auto g3 = make_csr(edges, n); + std::vector d3(nv, std::numeric_limits::max()); + graph::dijkstra_shortest_distances( + g3, vertex_id_t{0}, graph::container_value_fn(d3), + weight_fn, graph::empty_visitor{}, + std::less{}, std::plus{}, + graph::use_default_heap{}, std::allocator{}); + + // BGL + auto g_bgl = make_bgl_csr(edges, n); + std::vector d_bgl(nv, std::numeric_limits::max()); + d_bgl[0] = 0.0; + run_bgl_dijkstra(g_bgl, vertex_id_t{0}, d_bgl); + + if (d3.size() != d_bgl.size()) return false; + for (std::size_t i = 0; i < nv; ++i) { + // Both write infinity for unreachable; otherwise distances must agree + // exactly (same edge weights, same compare/combine). + if (d3[i] != d_bgl[i]) return false; + } + return true; + }; + + bool ok = true; + ok &= check(erdos_renyi(n, 8.0 / static_cast(n))); + ok &= check(barabasi_albert(n, 4)); + ok &= check(path_graph(n)); + return ok; +} + +const bool kBglParityChecked = [] { + if (!check_bgl_distance_parity()) { + std::fprintf(stderr, + "FATAL: BGL vs graph-v3 distance parity check failed; " + "benchmarks would compare incorrect results.\n"); + std::abort(); + } + return true; +}(); + +} // namespace + +#endif // BENCH_BGL + +// --------------------------------------------------------------------------- +// Entry point +// --------------------------------------------------------------------------- + +BENCHMARK_MAIN(); diff --git a/benchmark/algorithms/bgl_dijkstra_fixtures.hpp b/benchmark/algorithms/bgl_dijkstra_fixtures.hpp new file mode 100644 index 0000000..1578c43 --- /dev/null +++ b/benchmark/algorithms/bgl_dijkstra_fixtures.hpp @@ -0,0 +1,132 @@ +/** + * @file bgl_dijkstra_fixtures.hpp + * @brief Boost.Graph (BGL) container builders for Dijkstra comparison benchmarks. + * + * Companion to dijkstra_fixtures.hpp. Builds BGL graphs from the *same* + * edge_list produced by the synthetic generators, so graph-v3 and BGL + * benchmarks operate on topologically identical graphs. + * + * compressed_graph (CSR) ↔ boost::compressed_sparse_row_graph + * dynamic_graph (vov) ↔ boost::adjacency_list + * + * Only compiled when BENCH_BGL is defined (set by CMake when the BGL include + * directory is available); see benchmark/algorithms/CMakeLists.txt. + * + * Phase 4.3 — sanity-check comparison only. BGL distance results must match + * graph-v3 distance results bit-for-bit on the same source vertex; the + * benchmarks assert this once at startup before timing starts. + */ + +#pragma once + +#include "dijkstra_fixtures.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace graph::benchmark { + +// --------------------------------------------------------------------------- +// Bundled edge property: a single double weight, mirroring graph-v3 layout. +// --------------------------------------------------------------------------- + +struct bgl_edge_prop { + double weight = 0.0; +}; + +// --------------------------------------------------------------------------- +// BGL container types +// --------------------------------------------------------------------------- + +/// CSR equivalent. directedS, no vertex bundle, edge bundle = bgl_edge_prop. +/// vertex_index is implicit (vecS storage). +using bgl_csr_graph_t = + boost::compressed_sparse_row_graph; + +/// adjacency_list equivalent of dynamic_graph: vector of vector, +/// directed, edge bundle stores the weight. +using bgl_adj_graph_t = + boost::adjacency_list; + +// --------------------------------------------------------------------------- +// Container builders +// --------------------------------------------------------------------------- + +/// Build a BGL CSR graph from a graph-v3 edge_list (already sorted by source). +inline bgl_csr_graph_t make_bgl_csr(const edge_list& edges, vertex_id_t num_vertices) { + // BGL CSR ctor wants a range of (vertex_id_t, vertex_id_t) pairs and a + // parallel range of edge bundles. + std::vector> pairs; + std::vector props; + pairs.reserve(edges.size()); + props.reserve(edges.size()); + for (const auto& e : edges) { + pairs.emplace_back(e.source_id, e.target_id); + props.push_back(bgl_edge_prop{static_cast(e.value)}); + } + return bgl_csr_graph_t(boost::edges_are_sorted, + pairs.begin(), pairs.end(), + props.begin(), + static_cast(num_vertices)); +} + +/// Build a BGL adjacency_list from the same edge_list. +inline bgl_adj_graph_t make_bgl_adj(const edge_list& edges, vertex_id_t num_vertices) { + bgl_adj_graph_t g(static_cast(num_vertices)); + for (const auto& e : edges) { + boost::add_edge(e.source_id, e.target_id, + bgl_edge_prop{static_cast(e.value)}, g); + } + return g; +} + +// --------------------------------------------------------------------------- +// Dijkstra wrapper +// +// Uses the no_color_map, no_init variant to match graph-v3 semantics: +// caller pre-initialises the distance vector; no per-call color-map +// allocation. Predecessor map is required by the BGL signature even when +// unused — wired to a dummy iterator_property_map. +// --------------------------------------------------------------------------- + +template +inline void run_bgl_dijkstra(const BglGraph& g, + vertex_id_t source, + std::vector& dist) { + using vd_t = typename boost::graph_traits::vertex_descriptor; + + // Predecessor scratch — required by the API but unused here. + std::vector pred(boost::num_vertices(g)); + + auto idx = boost::get(boost::vertex_index, g); + auto dist_pmap = boost::make_iterator_property_map(dist.begin(), idx); + auto pred_pmap = boost::make_iterator_property_map(pred.begin(), idx); + auto w_pmap = boost::get(&bgl_edge_prop::weight, g); + + boost::dijkstra_shortest_paths_no_color_map_no_init( + g, static_cast(source), + pred_pmap, + dist_pmap, + w_pmap, + idx, + std::less{}, + boost::closed_plus(), + std::numeric_limits::max(), + 0.0, + boost::default_dijkstra_visitor{}); +} + +} // namespace graph::benchmark diff --git a/benchmark/algorithms/dijkstra_fixtures.hpp b/benchmark/algorithms/dijkstra_fixtures.hpp new file mode 100644 index 0000000..114d9ca --- /dev/null +++ b/benchmark/algorithms/dijkstra_fixtures.hpp @@ -0,0 +1,267 @@ +/** + * @file dijkstra_fixtures.hpp + * @brief Synthetic graph generators for Dijkstra benchmarks. + * + * Provides graph generators that isolate three orthogonal axes: + * - Scale : V ∈ {1K, 10K, 100K} + * - Topology : Erdős–Rényi, 2D grid, Barabási–Albert, path + * - Weight dist : uniform, exponential, constant-1 + * + * Each generator returns a sorted edge_list (sorted by source_id, as + * required by compressed_graph). Pass the list to make_csr() or make_vov() + * to build the target container. + * + * Usage: + * auto edges = benchmark::erdos_renyi(10'000, 8.0 / 10'000); + * auto g = benchmark::make_csr(edges, 10'000); + * // ... run Dijkstra on g + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace graph::benchmark { + +// --------------------------------------------------------------------------- +// Common types +// --------------------------------------------------------------------------- + +using vertex_id_t = uint32_t; +using weight_t = double; +using edge_entry = graph::copyable_edge_t; +using edge_list = std::vector; + +/// Primary container: CSR layout; minimises traversal overhead so that +/// heap cost is the dominant measurable term. +using csr_graph_t = + graph::container::compressed_graph; + +/// Secondary container: vov-backed dynamic_graph; representative of typical +/// user code and used as a regression baseline. +using vov_graph_t = + graph::container::dynamic_graph>; + +// --------------------------------------------------------------------------- +// Weight distribution +// --------------------------------------------------------------------------- + +enum class weight_dist { + uniform, ///< U[1, 100] — default, "average case" + exponential, ///< Exp(0.1) + 1 — heavy left tail, more decrease-key events + constant_one, ///< Always 1 — BFS-equivalent floor, minimum variance +}; + +inline double sample_weight(std::mt19937_64& rng, weight_dist dist) { + switch (dist) { + case weight_dist::uniform: { + std::uniform_real_distribution d(1.0, 100.0); + return d(rng); + } + case weight_dist::exponential: { + std::exponential_distribution d(0.1); + return 1.0 + d(rng); + } + case weight_dist::constant_one: + default: + return 1.0; + } +} + +// --------------------------------------------------------------------------- +// Erdős–Rényi G(n, p) — directed, self-loops excluded +// +// Uses the O(E) geometric-skip algorithm (Batagelj & Brandes, 2005) instead +// of the naive O(n²) coin-flip loop, so it scales to n = 10⁶. +// +// The n*(n−1) ordered (u,v) pairs with u≠v are enumerated as positions +// pos ∈ [0, n*(n−1)) +// where position pos maps to: +// u = pos / (n−1) +// offset = pos % (n−1) +// v = offset < u ? offset : offset + 1 (skip self-loop) +// +// Set p = k / n for E/V ≈ k (sparse: k=2, moderate: k=8, dense: k=32). +// The resulting edge list is already sorted by source_id because positions +// are visited in ascending order and u is non-decreasing. +// --------------------------------------------------------------------------- + +inline edge_list erdos_renyi(vertex_id_t n, double p, uint64_t seed = 42, + weight_dist wdist = weight_dist::uniform) { + std::mt19937_64 rng(seed); + const size_t total = static_cast(n) * (n - 1); // n*(n-1) directed pairs + + edge_list edges; + const size_t expected = static_cast(total * p * 1.1) + 16; + edges.reserve(expected); + + // Geometric skip: sample the gap between consecutive selected positions. + // std::geometric_distribution gives the number of failures before + // the first success, so adding 1 gives the gap to the *next* success. + std::geometric_distribution geom(p); + + size_t pos = geom(rng); // 0-indexed position of the first selected edge + while (pos < total) { + const vertex_id_t u = static_cast(pos / (n - 1)); + const vertex_id_t offset = static_cast(pos % (n - 1)); + const vertex_id_t v = (offset < u) ? offset : offset + 1; + edges.push_back({u, v, sample_weight(rng, wdist)}); + pos += geom(rng) + 1; + } + // Edges are already sorted by source_id (u is non-decreasing). + return edges; +} + +// --------------------------------------------------------------------------- +// 2D grid graph (rows × cols) — bidirectional 4-connected +// +// Vertex (r, c) has id r*cols + c. +// Horizontal and vertical neighbour pairs each get two directed edges +// (both directions), giving E/V ≈ 4 for interior vertices. +// The returned list is sorted by source_id. +// --------------------------------------------------------------------------- + +inline edge_list grid_2d(vertex_id_t rows, vertex_id_t cols, uint64_t seed = 42, + weight_dist wdist = weight_dist::uniform) { + std::mt19937_64 rng(seed); + const vertex_id_t n = rows * cols; + + edge_list edges; + edges.reserve(4 * static_cast(n)); // upper bound + + for (vertex_id_t r = 0; r < rows; ++r) { + for (vertex_id_t c = 0; c < cols; ++c) { + vertex_id_t u = r * cols + c; + // Right neighbour + if (c + 1 < cols) { + vertex_id_t v = u + 1; + edges.push_back({u, v, sample_weight(rng, wdist)}); + edges.push_back({v, u, sample_weight(rng, wdist)}); + } + // Down neighbour + if (r + 1 < rows) { + vertex_id_t v = u + cols; + edges.push_back({u, v, sample_weight(rng, wdist)}); + edges.push_back({v, u, sample_weight(rng, wdist)}); + } + } + } + std::stable_sort(edges.begin(), edges.end(), + [](const edge_entry& a, const edge_entry& b) { + return a.source_id < b.source_id; + }); + return edges; +} + +// --------------------------------------------------------------------------- +// Barabási–Albert preferential attachment — scale-free / power-law +// +// Starts with a fully-connected seed of m0 = max(m, 2) vertices, then +// adds each subsequent vertex w by selecting m existing targets with +// probability proportional to their current degree ("urn" method). +// Both w→t and t→w directed edges are added so the graph is undirected +// in terms of reachability, which maximises relaxation traffic from hubs. +// The returned list is sorted by source_id. +// --------------------------------------------------------------------------- + +inline edge_list barabasi_albert(vertex_id_t n, vertex_id_t m, uint64_t seed = 42, + weight_dist wdist = weight_dist::uniform) { + std::mt19937_64 rng(seed); + + // "Urn" stores one entry per endpoint per edge, giving degree-proportional + // selection at O(1) per pick (trade memory for simplicity). + std::vector urn; + urn.reserve(2 * static_cast(n) * m); + + edge_list edges; + edges.reserve(2 * static_cast(n) * m); + + // Seed: fully-connected clique of m0 vertices + const vertex_id_t m0 = std::max(m, vertex_id_t{2}); + for (vertex_id_t u = 0; u < m0; ++u) { + for (vertex_id_t v = u + 1; v < m0; ++v) { + edges.push_back({u, v, sample_weight(rng, wdist)}); + edges.push_back({v, u, sample_weight(rng, wdist)}); + urn.push_back(u); + urn.push_back(v); + } + } + + for (vertex_id_t w = m0; w < n; ++w) { + std::vector chosen; + chosen.reserve(m); + + while (chosen.size() < m) { + std::uniform_int_distribution pick(0, urn.size() - 1); + vertex_id_t t = urn[pick(rng)]; + // chosen.size() ≤ m ≤ ~8 so linear scan is fine + bool already = (t == w); + for (auto x : chosen) already |= (x == t); + if (!already) { + chosen.push_back(t); + edges.push_back({w, t, sample_weight(rng, wdist)}); + edges.push_back({t, w, sample_weight(rng, wdist)}); + urn.push_back(w); + urn.push_back(t); + } + } + } + + std::stable_sort(edges.begin(), edges.end(), + [](const edge_entry& a, const edge_entry& b) { + return a.source_id < b.source_id; + }); + return edges; +} + +// --------------------------------------------------------------------------- +// Path graph: 0 → 1 → 2 → … → (n−1) +// +// Minimum decrease-key traffic: each vertex is relaxed at most once. +// Serves as a lower-bound sanity check. +// --------------------------------------------------------------------------- + +inline edge_list path_graph(vertex_id_t n, uint64_t seed = 42, + weight_dist wdist = weight_dist::uniform) { + std::mt19937_64 rng(seed); + edge_list edges; + edges.reserve(n > 0 ? n - 1 : 0); + + for (vertex_id_t u = 0; u + 1 < n; ++u) { + edges.push_back({u, u + 1, sample_weight(rng, wdist)}); + } + // Already sorted. + return edges; +} + +// --------------------------------------------------------------------------- +// Container builders +// --------------------------------------------------------------------------- + +/// Build a compressed_graph (CSR) from a pre-sorted edge list. +/// edges must be sorted ascending by source_id (enforced by assertion in +/// compressed_graph::load_edges). +inline csr_graph_t make_csr(const edge_list& edges, vertex_id_t num_vertices) { + csr_graph_t g; + g.load_edges(edges, std::identity{}, num_vertices); + return g; +} + +/// Build a vov dynamic_graph from an edge list (order does not matter). +inline vov_graph_t make_vov(const edge_list& edges, vertex_id_t num_vertices) { + vov_graph_t g; + g.load_edges(edges, std::identity{}, num_vertices); + return g; +} + +} // namespace graph::benchmark diff --git a/benchmark/data/README.md b/benchmark/data/README.md new file mode 100644 index 0000000..8355008 --- /dev/null +++ b/benchmark/data/README.md @@ -0,0 +1,73 @@ +# Benchmark Data + +This directory contains real-world graph data files used as validation +fixtures in the Dijkstra benchmark suite (Phase 0.2). The files are +large and are therefore **not committed** to the repository (see +`.gitignore`). Use the instructions below to download them before +running the real-world validation benchmarks. + +--- + +## Required files + +| Filename | Vertices | Edges | Source | Description | +|----------|----------|-------|--------|-------------| +| `roadNet-CA.txt` | 1,965,206 | 5,533,214 | SNAP | California road network – classic Dijkstra benchmark, planar/spatial | +| `web-Google.txt` | 875,713 | 5,105,039 | SNAP | Web-link graph – mixed degree distribution | + +--- + +## Download instructions + +### Stanford SNAP graphs + +```bash +# Create the data directory if it does not already exist +mkdir -p benchmark/data + +# California road network +curl -L "https://snap.stanford.edu/data/roadNet-CA.txt.gz" \ + | gunzip > benchmark/data/roadNet-CA.txt + +# Google web graph +curl -L "https://snap.stanford.edu/data/web-Google.txt.gz" \ + | gunzip > benchmark/data/web-Google.txt +``` + +Alternatively, download from and place +the decompressed `.txt` files in this directory. + +--- + +## File format + +SNAP edge-list files use the following format: + +``` +# Comment lines start with '#' +\t +``` + +Vertex ids are 0-based integers. The benchmark loader skips comment +lines and treats each remaining line as a directed edge. + +--- + +## Loader + +The fixture helper `benchmark/algorithms/dijkstra_fixtures.hpp` will +gain a `load_snap_graph()` function in Phase 0 that reads these files +and returns a sorted `edge_list`. For now, running the real-world +benchmarks requires that the files are present; if they are absent the +corresponding benchmark cases are skipped at runtime with a message. + +--- + +## License / attribution + +The SNAP graphs are distributed by the Stanford Network Analysis Project +under their respective licences. Please cite the original dataset when +publishing results. + +- Jure Leskovec and Andrej Krevl. *SNAP Datasets: Stanford Large Network + Dataset Collection*, , June 2014. diff --git a/include/graph/algorithm/dijkstra_shortest_paths.hpp b/include/graph/algorithm/dijkstra_shortest_paths.hpp index 2d60f0b..4df527c 100644 --- a/include/graph/algorithm/dijkstra_shortest_paths.hpp +++ b/include/graph/algorithm/dijkstra_shortest_paths.hpp @@ -15,16 +15,53 @@ #include "graph/graph.hpp" #include "graph/algorithm/traversal_common.hpp" #include "graph/adj_list/vertex_property_map.hpp" +#include "graph/detail/indexed_dary_heap.hpp" +#include "graph/detail/heap_position_map.hpp" #include #include #include +#include #ifndef GRAPH_DIJKSTRA_SHORTEST_PATHS_HPP # define GRAPH_DIJKSTRA_SHORTEST_PATHS_HPP namespace graph { +/** + * @brief Heap-selector tag: use the historical std::priority_queue path. + * + * Default selector for `dijkstra_shortest_paths`. Lazy-deletion priority queue; + * heap may grow to O(E) and stale entries are skipped at pop time. + * + * Recommended for: sparse graphs (E/V ≲ 4), grid-like topologies, path/tree + * graphs, and any workload with low decrease-key pressure. Phase 4 benchmarks + * showed this path wins by 20–40% on grid (E/V≈4) and path (E/V=1) workloads. + */ +struct use_default_heap {}; + +/** + * @brief Heap-selector tag: use the indexed d-ary heap with true decrease-key. + * + * Heap size is bounded by O(V); no stale pops. Supports both index_vertex_range + * graphs (dense vector_position_map) and mapped containers / hashable non-dense + * vertex ids (assoc_position_map). + * + * Recommended for: dense or hub-heavy graphs (E/V ≳ 8) where many edges trigger + * relaxation. Phase 4 benchmarks at 100K vertices on `compressed_graph`: + * Erdős–Rényi (E/V≈8) −25%, Barabási–Albert (E/V≈8) −17% with `Arity=8`. + * Loses 20–40% on grid/path workloads where decrease-key is rare. + * + * `Arity=8` is the recommended setting on x86_64 for high-E/V workloads; + * `Arity=4` matches Boost's `d_ary_heap_indirect`. + * + * @tparam Arity Children per node (default 4 — matches Boost's d_ary_heap_indirect). + */ +template +struct use_indexed_dary_heap { + static constexpr std::size_t arity = Arity; +}; + // Import CPOs and types for use in algorithms using adj_list::vertices; using adj_list::num_vertices; @@ -178,6 +215,7 @@ template < class Visitor = empty_visitor, class Compare = less>, class Combine = plus>, + class Heap = use_default_heap, class Alloc = std::allocator> requires distance_fn_for && // predecessor_fn_for && // @@ -195,7 +233,8 @@ constexpr void dijkstra_shortest_paths( Visitor&& visitor = empty_visitor(), Compare&& compare = less>(), Combine&& combine = plus>(), - const Alloc& alloc = Alloc()) { + Heap /*heap_tag*/ = Heap{}, + const Alloc& alloc = Alloc()) { using graph_type = std::remove_reference_t; using id_type = vertex_id_t; using distance_type = distance_fn_value_t; @@ -234,28 +273,7 @@ constexpr void dijkstra_shortest_paths( return false; }; - // Define and initialize the priority queue for Dijkstra's algorithm. We use a min-heap based on distance. - // - // NOTE: std::priority_queue lacks a decrease-key operation, so when a vertex's distance - // improves we re-insert it (lazy deletion). The earlier entry becomes stale and is - // skipped at pop time (see the stale-pop check in the main loop). This keeps the code - // simple but allows the heap to grow to O(E) entries in the worst case. - // - // A future optimization is to replace this with an indexed d-ary heap supporting true - // decrease-key (matching Boost's d_ary_heap_indirect with d=4). That would cap heap - // size at O(V), eliminate stale pops, and typically improve cache behavior. See - // agents/indexed_dary_heap_plan.md for the design and migration plan. - struct weighted_vertex { - vertex_t vertex_desc = {}; - distance_type weight = distance_type(); - }; - auto qcompare = [&compare](const weighted_vertex& a, const weighted_vertex& b) { - return compare(b.weight, a.weight); // min-heap: pop lowest weight first - }; - using WVAlloc = typename std::allocator_traits::template rebind_alloc; - using Queue = std::priority_queue, decltype(qcompare)>; - Queue queue(qcompare, std::vector(WVAlloc(alloc))); - + // Initialize-vertex visitor callbacks (shared across heap implementations). // (The optimizer removes this loop if on_initialize_vertex() is empty.) if constexpr (has_on_initialize_vertex || has_on_initialize_vertex_id) { for (auto&& [uid, u] : views::vertexlist(g)) { @@ -267,86 +285,248 @@ constexpr void dijkstra_shortest_paths( } } - // Seed the queue with the initial vertice(s) - for (auto&& seed_id : sources) { - auto seed_it = find_vertex(g, seed_id); - if (seed_it == std::ranges::end(vertices(g))) { - throw std::out_of_range(std::format("dijkstra_shortest_paths: source vertex id '{}' is out of range", seed_id)); - } - vertex_t seed = *seed_it; - - distance(g, seed_id) = zero; // mark seed_id as discovered - queue.push({seed, zero}); - if constexpr (has_on_discover_vertex) { - visitor.on_discover_vertex(g, seed); - } else if constexpr (has_on_discover_vertex_id) { - visitor.on_discover_vertex(g, seed_id); - } - } + // --------------------------------------------------------------------- + // Heap-implementation dispatch. + // + // - use_default_heap : std::priority_queue with lazy deletion. + // - use_indexed_dary_heap : indexed d-ary heap with true decrease-key + // (heap size bounded by O(V)). + // + // Both branches honour identical visitor semantics: on_examine_vertex and + // on_finish_vertex fire exactly once per reachable vertex; on_edge_relaxed + // and on_edge_not_relaxed fire exactly once per outgoing edge of every + // examined vertex. + // --------------------------------------------------------------------- + if constexpr (std::is_same_v) { + // ----------------------------------------------------------------- + // std::priority_queue path (legacy / default). + // + // std::priority_queue lacks a decrease-key operation, so when a vertex's + // distance improves we re-insert it (lazy deletion). The earlier entry + // becomes stale and is skipped at pop time. This keeps the code simple + // but allows the heap to grow to O(E) entries in the worst case. + // ----------------------------------------------------------------- + struct weighted_vertex { + vertex_t vertex_desc = {}; + distance_type weight = distance_type(); + }; + auto qcompare = [&compare](const weighted_vertex& a, const weighted_vertex& b) { + return compare(b.weight, a.weight); // min-heap: pop lowest weight first + }; + using WVAlloc = typename std::allocator_traits::template rebind_alloc; + using Queue = std::priority_queue, decltype(qcompare)>; + Queue queue(qcompare, std::vector(WVAlloc(alloc))); - // Main loop to process the queue - while (!queue.empty()) { - auto [u, w] = queue.top(); - queue.pop(); - const id_type uid = vertex_id(g, u); - - // Skip stale queue entries: because std::priority_queue lacks decrease-key, - // we re-insert vertices when their distance is improved. The earlier (larger) - // entry is still in the heap and must be ignored when popped. This also - // ensures on_examine_vertex / on_finish_vertex fire exactly once per vertex, - // matching BGL visitor semantics. - if (compare(distance(g, uid), w)) { - continue; - } + // Seed the queue with the initial vertice(s) + for (auto&& seed_id : sources) { + auto seed_it = find_vertex(g, seed_id); + if (seed_it == std::ranges::end(vertices(g))) { + throw std::out_of_range(std::format("dijkstra_shortest_paths: source vertex id '{}' is out of range", seed_id)); + } + vertex_t seed = *seed_it; - if constexpr (has_on_examine_vertex) { - visitor.on_examine_vertex(g, u); - } else if constexpr (has_on_examine_vertex_id) { - visitor.on_examine_vertex(g, uid); + distance(g, seed_id) = zero; // mark seed_id as discovered + queue.push({seed, zero}); + if constexpr (has_on_discover_vertex) { + visitor.on_discover_vertex(g, seed); + } else if constexpr (has_on_discover_vertex_id) { + visitor.on_discover_vertex(g, seed_id); + } } - // Process all outgoing edges from the current vertex - for (auto&& [vid, uv] : views::incidence(g, u)) { - if constexpr (has_on_examine_edge) { - visitor.on_examine_edge(g, uv); + // Main loop to process the queue + while (!queue.empty()) { + auto [u, w] = queue.top(); + queue.pop(); + const id_type uid = vertex_id(g, u); + + // Skip stale queue entries: because std::priority_queue lacks decrease-key, + // we re-insert vertices when their distance is improved. The earlier (larger) + // entry is still in the heap and must be ignored when popped. This also + // ensures on_examine_vertex / on_finish_vertex fire exactly once per vertex, + // matching BGL visitor semantics. + if (compare(distance(g, uid), w)) { + continue; } - // Use the user-supplied comparator for "undiscovered" detection so that - // custom Compare orderings remain consistent (matches BGL's - // !distance_compare(neighbor_distance, infinity)). - const bool is_neighbor_undiscovered = !compare(distance(g, vid), infinite); - const bool was_edge_relaxed = relax_target(uv, uid); + if constexpr (has_on_examine_vertex) { + visitor.on_examine_vertex(g, u); + } else if constexpr (has_on_examine_vertex_id) { + visitor.on_examine_vertex(g, uid); + } - if (was_edge_relaxed) { - if constexpr (has_on_edge_relaxed) { - visitor.on_edge_relaxed(g, uv); + // Process all outgoing edges from the current vertex + for (auto&& [vid, uv] : views::incidence(g, u)) { + if constexpr (has_on_examine_edge) { + visitor.on_examine_edge(g, uv); } - vertex_t v = target(g, uv); - if (is_neighbor_undiscovered) { - if constexpr (has_on_discover_vertex) { - visitor.on_discover_vertex(g, v); - } else if constexpr (has_on_discover_vertex_id) { - visitor.on_discover_vertex(g, vid); + + // Use the user-supplied comparator for "undiscovered" detection so that + // custom Compare orderings remain consistent (matches BGL's + // !distance_compare(neighbor_distance, infinity)). + const bool is_neighbor_undiscovered = !compare(distance(g, vid), infinite); + const bool was_edge_relaxed = relax_target(uv, uid); + + if (was_edge_relaxed) { + if constexpr (has_on_edge_relaxed) { + visitor.on_edge_relaxed(g, uv); + } + vertex_t v = target(g, uv); + if (is_neighbor_undiscovered) { + if constexpr (has_on_discover_vertex) { + visitor.on_discover_vertex(g, v); + } else if constexpr (has_on_discover_vertex_id) { + visitor.on_discover_vertex(g, vid); + } + } + queue.push({v, distance(g, vid)}); + } else { + if constexpr (has_on_edge_not_relaxed) { + visitor.on_edge_not_relaxed(g, uv); } } - queue.push({v, distance(g, vid)}); - } else { - if constexpr (has_on_edge_not_relaxed) { - visitor.on_edge_not_relaxed(g, uv); + } + + // The stale-pop skip at the top of the loop guarantees we only reach this + // point on the settled (final) pop of u, so on_examine_vertex and + // on_finish_vertex are each called exactly once per reachable vertex, + // matching BGL visitor semantics. + if constexpr (has_on_finish_vertex) { + visitor.on_finish_vertex(g, u); + } else if constexpr (has_on_finish_vertex_id) { + visitor.on_finish_vertex(g, uid); + } + } // while(!queue.empty()) + } else { + // ----------------------------------------------------------------- + // indexed d-ary heap path. + // + // True decrease-key: at most one heap entry per vertex (size <= V). + // No stale pops. Vertex distances are read live via DistanceFn so the + // heap order tracks the current best-known distance. + // + // The position-map adapter is selected at compile time: + // + // - index_vertex_range : vector_position_map (dense O(V) array). + // - mapped containers / non-dense ids + // : assoc_position_map (unordered_map). + // + // The vector adapter is faster (no hashing, contiguous storage) but + // requires vertex ids in [0, num_vertices(g)). Mapped containers + // (mov, mod, uov, ...) and any graph whose vertex_id_t is non-integral + // fall through to the associative adapter automatically. + // ----------------------------------------------------------------- + constexpr std::size_t arity = Heap::arity; + + // Live distance lookup for the heap (reads, never writes). + auto heap_distfn = [&g, &distance](const id_type& k) -> const distance_type& { + return distance(g, k); + }; + + // Seed + main loop, generic over the heap type so the dense and sparse + // position-map branches share a single body. + auto run = [&](auto& heap) { + // Seed the heap with the initial vertice(s). + for (auto&& seed_id : sources) { + auto seed_it = find_vertex(g, seed_id); + if (seed_it == std::ranges::end(vertices(g))) { + throw std::out_of_range( + std::format("dijkstra_shortest_paths: source vertex id '{}' is out of range", seed_id)); + } + + distance(g, seed_id) = zero; // mark seed_id as discovered + heap.push(static_cast(seed_id)); + if constexpr (has_on_discover_vertex) { + visitor.on_discover_vertex(g, *seed_it); + } else if constexpr (has_on_discover_vertex_id) { + visitor.on_discover_vertex(g, seed_id); } } - } - // The stale-pop skip at the top of the loop guarantees we only reach this - // point on the settled (final) pop of u, so on_examine_vertex and - // on_finish_vertex are each called exactly once per reachable vertex, - // matching BGL visitor semantics. - if constexpr (has_on_finish_vertex) { - visitor.on_finish_vertex(g, u); - } else if constexpr (has_on_finish_vertex_id) { - visitor.on_finish_vertex(g, uid); + // Main loop. With true decrease-key there are no stale entries: every + // pop yields the next finalized vertex. + while (!heap.empty()) { + const id_type uid = heap.top(); + heap.pop(); + vertex_t u = *find_vertex(g, uid); + + if constexpr (has_on_examine_vertex) { + visitor.on_examine_vertex(g, u); + } else if constexpr (has_on_examine_vertex_id) { + visitor.on_examine_vertex(g, uid); + } + + for (auto&& [vid, uv] : views::incidence(g, u)) { + if constexpr (has_on_examine_edge) { + visitor.on_examine_edge(g, uv); + } + + const bool is_neighbor_undiscovered = !compare(distance(g, vid), infinite); + const bool was_edge_relaxed = relax_target(uv, uid); + + if (was_edge_relaxed) { + if constexpr (has_on_edge_relaxed) { + visitor.on_edge_relaxed(g, uv); + } + if (is_neighbor_undiscovered) { + if constexpr (has_on_discover_vertex) { + vertex_t v = target(g, uv); + visitor.on_discover_vertex(g, v); + } else if constexpr (has_on_discover_vertex_id) { + visitor.on_discover_vertex(g, vid); + } + heap.push(vid); + } else { + // v has finite distance and was just improved; under Dijkstra's + // non-negative-weight invariant a finalized vertex cannot be + // relaxed, so v must still be in the heap. + heap.decrease(vid); + } + } else { + if constexpr (has_on_edge_not_relaxed) { + visitor.on_edge_not_relaxed(g, uv); + } + } + } + + if constexpr (has_on_finish_vertex) { + visitor.on_finish_vertex(g, u); + } else if constexpr (has_on_finish_vertex_id) { + visitor.on_finish_vertex(g, uid); + } + } // while(!heap.empty()) + }; + + using HeapAlloc = typename std::allocator_traits::template rebind_alloc; + + if constexpr (adj_list::index_vertex_range) { + // ---- Dense path: vector_position_map ---- + // The position vector uses the default allocator; the user-supplied + // Alloc is forwarded only to the heap's internal storage (matching the + // documented role of Alloc as "internal priority queue storage"). + std::vector positions(num_vertices(g), detail::vector_position_map::npos); + using HeapT = detail::indexed_dary_heap; + HeapT heap(heap_distfn, compare, + detail::vector_position_map{positions}, + HeapAlloc(alloc)); + run(heap); + } else { + // ---- Sparse / mapped path: assoc_position_map ---- + static_assert(adj_list::hashable_vertex_id, + "use_indexed_dary_heap requires either index_vertex_range or a " + "hashable vertex_id_t for the associative position-map adapter."); + + using PMap = detail::assoc_position_map; + typename PMap::map_type positions; + positions.reserve(num_vertices(g)); + + using HeapT = detail::indexed_dary_heap; + HeapT heap(heap_distfn, compare, PMap{positions}, HeapAlloc(alloc)); + run(heap); } - } // while(!queue.empty()) + } // if constexpr Heap dispatch } /** @@ -368,6 +548,7 @@ template < class Visitor = empty_visitor, class Compare = less>, class Combine = plus>, + class Heap = use_default_heap, class Alloc = std::allocator> requires distance_fn_for && // predecessor_fn_for && // @@ -384,9 +565,11 @@ constexpr void dijkstra_shortest_paths( Visitor&& visitor = empty_visitor(), Compare&& compare = less>(), Combine&& combine = plus>(), - const Alloc& alloc = Alloc()) { + Heap heap_tag = Heap{}, + const Alloc& alloc = Alloc()) { dijkstra_shortest_paths(g, subrange(&source, (&source + 1)), distance, predecessor, weight, - forward(visitor), forward(compare), forward(combine), alloc); + forward(visitor), forward(compare), forward(combine), + heap_tag, alloc); } /** @@ -431,6 +614,7 @@ template < class Visitor = empty_visitor, class Compare = less>, class Combine = plus>, + class Heap = use_default_heap, class Alloc = std::allocator> requires distance_fn_for && // convertible_to, vertex_id_t> && // @@ -446,9 +630,10 @@ constexpr void dijkstra_shortest_distances( Visitor&& visitor = empty_visitor(), Compare&& compare = less>(), Combine&& combine = plus>(), - const Alloc& alloc = Alloc()) { + Heap heap_tag = Heap{}, + const Alloc& alloc = Alloc()) { dijkstra_shortest_paths(g, sources, distance, _null_predecessor, forward(weight), forward(visitor), - forward(compare), forward(combine), alloc); + forward(compare), forward(combine), heap_tag, alloc); } /** @@ -471,6 +656,7 @@ template < class Visitor = empty_visitor, class Compare = less>, class Combine = plus>, + class Heap = use_default_heap, class Alloc = std::allocator> requires distance_fn_for && // basic_edge_weight_function, Compare, Combine> @@ -485,9 +671,11 @@ constexpr void dijkstra_shortest_distances( Visitor&& visitor = empty_visitor(), Compare&& compare = less>(), Combine&& combine = plus>(), - const Alloc& alloc = Alloc()) { + Heap heap_tag = Heap{}, + const Alloc& alloc = Alloc()) { dijkstra_shortest_paths(g, subrange(&source, (&source + 1)), distance, _null_predecessor, forward(weight), - forward(visitor), forward(compare), forward(combine), alloc); + forward(visitor), forward(compare), forward(combine), + heap_tag, alloc); } } // namespace graph diff --git a/include/graph/algorithm/mst.hpp b/include/graph/algorithm/mst.hpp index b0ca2d5..84638a3 100644 --- a/include/graph/algorithm/mst.hpp +++ b/include/graph/algorithm/mst.hpp @@ -194,17 +194,30 @@ * **Performance Notes:** * * **Prim's Priority Queue:** - * This implementation uses a binary heap (std::priority_queue) which provides O(E log V) - * complexity. While Fibonacci heap implementations achieve better theoretical complexity - * O(E + V log V), they have significantly higher constant factors and more complex - * bookkeeping. In practice: - * - **Binary heap is faster** for most real-world graphs (used here) - * - **Fibonacci heap** only wins for extremely dense graphs where E ≈ V² - * and the improved amortized decrease-key operation dominates - * - **Simple array** (O(V²)) is fastest for complete graphs where E = V(V-1)/2 - * - * Benchmark testing shows binary heap is optimal for graphs with 100-100,000 vertices - * and typical densities (E = O(V) to O(V^1.5)). + * `prim()` is implemented as a thin wrapper over `dijkstra_shortest_paths`, + * so it inherits the same `Heap` template parameter: + * + * - `use_default_heap` (default): `std::priority_queue` with lazy deletion. + * Provides O(E log V). Good general-purpose choice. + * - `use_indexed_dary_heap`: indexed d-ary heap with true O(log_D V) + * decrease-key. Recommended opt-in for high-E/V random / scale-free + * workloads on `compressed_graph` (typically `D = 8`); see Dijkstra + * Phase 4 results. + * + * Implementation note: because Prim's relaxation criterion is + * `compare(w_uv, weight[v])` rather than Dijkstra's + * `compare(d_u + w_uv, distance[v])`, Prim does not satisfy the + * monotonicity invariant Dijkstra's main loop assumes. `prim()` therefore + * maintains a small `std::vector finalized(V)` and wraps `weight_fn` + * so finalized targets report `+infinity`, which prevents the relax step + * from corrupting `weight[]` (the MST output) or calling `decrease()` on + * a vertex that has already been popped. See + * `agents/indexed_dary_heap_plan.md` § Phase 5.2 for the full discussion + * and the standalone-Prim alternative ("Option 2"). + * + * Fibonacci heap implementations achieve O(E + V log V) but have higher + * constant factors and are not used here. A simple array (O(V²)) is fastest + * only for complete graphs. * * --- * @@ -257,6 +270,8 @@ #include "graph/algorithm/dijkstra_shortest_paths.hpp" #include #include +#include +#include #ifndef GRAPH_MST_HPP # define GRAPH_MST_HPP @@ -896,6 +911,7 @@ template (const std::remove_reference_t&, const edge_t&)>, class CompareOp = less>, + class Heap = use_default_heap, class Alloc = std::allocator> requires distance_fn_for && is_arithmetic_v> && @@ -909,21 +925,104 @@ auto prim(G&& g, // graph [](const auto& gr, const edge_t& uv) { return edge_value(gr, uv); }, // default weight_fn(g, uv) -> edge_value(g, uv) - CompareOp compare = less>(), // edge value comparator - const Alloc& alloc = Alloc() + CompareOp compare = less>(), // edge value comparator + Heap heap_tag = Heap{}, // heap selector (use_default_heap or use_indexed_dary_heap) + const Alloc& alloc = Alloc() ) { using edge_value_type = distance_fn_value_t; + using id_type = vertex_id_t; // Prim's combine: ignore accumulated distance, use edge weight directly. // This transforms Dijkstra's relaxation check from compare(d_u + w, d_v) // to compare(w, d_v), which is exactly Prim's criterion. auto prim_combine = [](edge_value_type /*d_u*/, edge_value_type w_uv) -> edge_value_type { return w_uv; }; - dijkstra_shortest_paths(g, seed, - std::forward(weight), - std::forward(predecessor), - std::forward(weight_fn), empty_visitor(), - std::forward(compare), prim_combine, alloc); + // --------------------------------------------------------------------- + // Prim correctness shim ("Option 1" in indexed_dary_heap_plan.md § 5.2). + // + // Dijkstra's main loop relies on the monotonicity invariant: + // with non-negative weights and combine = plus, a finalized vertex's + // distance can never be improved later. + // It therefore omits a "skip if finalized" guard in the relax step. + // + // Prim's combine (return w_uv, ignoring d_u) breaks that invariant: + // a finalized vertex v carries weight[v] = w_xv (the cheapest tree-edge + // found before v was popped). A later-popped neighbor y may present an + // edge y -> v with w_yv < weight[v]; relaxing it would *overwrite* the + // MST output (weight[] is the result, not a working distance), and with + // the indexed heap would call decrease() on a vertex whose heap position + // is npos -> out-of-bounds. + // + // Fix: track which vertices have been finalized and force weight_fn to + // report +infinity for any edge whose target is already finalized. The + // relax then computes compare(infinite, weight[v]) = false and the edge + // is skipped. This is correct for Prim because by definition a finalized + // vertex is already in the MST and cannot accept a cheaper tree-edge. + // + // The bitset add ~1 bit per vertex of memory and one predictable + // branch + bit-load per edge in the inner loop. See "Option 2" in the + // plan for a faster but more invasive standalone-Prim alternative. + // + // Storage strategy: for graphs with index_vertex_range (dense integer + // ids in [0, num_vertices)), a std::vector indexed by id is fast + // and cache-friendly. For sparse / mapped graphs whose ids are not + // contiguous (or not integral), an unordered_set is used instead. + // --------------------------------------------------------------------- + using GraphT = std::remove_reference_t; + if constexpr (adj_list::index_vertex_range) { + std::vector finalized(num_vertices(g), false); + + struct prim_finish_visitor { + std::vector* finalized_ptr; + void on_finish_vertex(const GraphT& /*gr*/, const id_type& uid) const { + (*finalized_ptr)[static_cast(uid)] = true; + } + }; + prim_finish_visitor visitor{&finalized}; + + auto wf_ref = std::ref(weight_fn); + auto guarded_weight_fn = [&finalized, wf_ref]( + const GraphT& gr, const edge_t& uv) -> edge_value_type { + const id_type vid = target_id(gr, uv); + if (finalized[static_cast(vid)]) { + return infinite_distance(); + } + return wf_ref.get()(gr, uv); + }; + + dijkstra_shortest_paths(g, seed, + std::forward(weight), + std::forward(predecessor), + guarded_weight_fn, visitor, + std::forward(compare), prim_combine, heap_tag, alloc); + } else { + std::unordered_set finalized; + finalized.reserve(num_vertices(g)); + + struct prim_finish_visitor { + std::unordered_set* finalized_ptr; + void on_finish_vertex(const GraphT& /*gr*/, const id_type& uid) const { + finalized_ptr->insert(uid); + } + }; + prim_finish_visitor visitor{&finalized}; + + auto wf_ref = std::ref(weight_fn); + auto guarded_weight_fn = [&finalized, wf_ref]( + const GraphT& gr, const edge_t& uv) -> edge_value_type { + const id_type vid = target_id(gr, uv); + if (finalized.contains(vid)) { + return infinite_distance(); + } + return wf_ref.get()(gr, uv); + }; + + dijkstra_shortest_paths(g, seed, + std::forward(weight), + std::forward(predecessor), + guarded_weight_fn, visitor, + std::forward(compare), prim_combine, heap_tag, alloc); + } // Calculate total MST weight by summing edge weights edge_value_type total_weight = edge_value_type{}; diff --git a/include/graph/detail/heap_position_map.hpp b/include/graph/detail/heap_position_map.hpp new file mode 100644 index 0000000..3a61695 --- /dev/null +++ b/include/graph/detail/heap_position_map.hpp @@ -0,0 +1,114 @@ +/** + * @file heap_position_map.hpp + * @brief Position-map adapters for indexed_dary_heap. + * + * Two adapters are provided: + * + * - vector_position_map : O(1) lookup for integral keys in a known dense + * range [0, n). Backed by a caller-owned + * std::vector. + * + * - assoc_position_map : O(1) average lookup for arbitrary hashable keys. + * Backed by a caller-owned std::unordered_map. + * Use this when vertex ids are sparse, non-integral, + * or come from a mapped graph container. + * + * Both adapters store a pointer to their backing storage; the storage must + * outlive the heap. This lets the caller reuse the same map across multiple + * Dijkstra runs (call reset() between runs). + * + * Concept (informal): + * - sentinel: static constexpr size_t npos + * - size_t position(Key) const // returns npos if not present + * - void set_position(Key, size_t) // npos means "remove" + */ + +#pragma once + +#include +#include +#include +#include + +namespace graph::detail { + +// --------------------------------------------------------------------------- +// vector_position_map +// +// O(1) position map for integral keys in [0, n). The caller owns the storage +// vector, sized to n and initialised to npos. set_position(k, npos) marks k +// as absent. reset() clears the entire map in O(n). +// --------------------------------------------------------------------------- + +class vector_position_map { +public: + static constexpr std::size_t npos = static_cast(-1); + + explicit vector_position_map(std::vector& storage) noexcept + : storage_(&storage) {} + + template + [[nodiscard]] std::size_t position(const Key& k) const noexcept { + return (*storage_)[static_cast(k)]; + } + + template + void set_position(const Key& k, std::size_t pos) noexcept { + (*storage_)[static_cast(k)] = pos; + } + + /// Reset all entries to npos. O(n). + void reset() noexcept { std::fill(storage_->begin(), storage_->end(), npos); } + + [[nodiscard]] std::size_t capacity() const noexcept { return storage_->size(); } + +private: + std::vector* storage_; +}; + +// --------------------------------------------------------------------------- +// assoc_position_map +// +// O(1) average position map for hashable keys (e.g. when vertex ids come from +// a mapped graph and are non-contiguous, or non-integral entirely). +// +// Storage is a caller-owned std::unordered_map. set_position +// with npos erases the key, keeping the map's size equal to the heap's size +// at all times — so contains(k) reduces to a single lookup. +// --------------------------------------------------------------------------- + +template , + class KeyEq = std::equal_to, + class Alloc = std::allocator>> +class assoc_position_map { +public: + using map_type = std::unordered_map; + static constexpr std::size_t npos = static_cast(-1); + + explicit assoc_position_map(map_type& storage) noexcept : storage_(&storage) {} + + [[nodiscard]] std::size_t position(const Key& k) const { + auto it = storage_->find(k); + return (it == storage_->end()) ? npos : it->second; + } + + void set_position(const Key& k, std::size_t pos) { + if (pos == npos) { + storage_->erase(k); + } else { + // Use insert_or_assign for O(1) amortised update with no temporary. + storage_->insert_or_assign(k, pos); + } + } + + /// Drop all entries. O(n). + void reset() noexcept(noexcept(storage_->clear())) { storage_->clear(); } + + [[nodiscard]] std::size_t tracked_size() const noexcept { return storage_->size(); } + +private: + map_type* storage_; +}; + +} // namespace graph::detail diff --git a/include/graph/detail/indexed_dary_heap.hpp b/include/graph/detail/indexed_dary_heap.hpp new file mode 100644 index 0000000..68c681c --- /dev/null +++ b/include/graph/detail/indexed_dary_heap.hpp @@ -0,0 +1,291 @@ +/** + * @file indexed_dary_heap.hpp + * @brief External-key, indirect-comparison d-ary min-heap with O(log_d N) + * decrease-key. + * + * Designed for Dijkstra and Prim where: + * - Vertex ids serve as stable external keys. + * - Distances live in a user-supplied container, accessed via a callable + * @c DistanceFn(Key) -> const Distance& . + * - The relax step needs O(log_d N) `decrease(key)` rather than O(N) re-push. + * + * The heap stores keys only. Distances are read live through @c DistanceFn so + * the heap never goes stale: when the algorithm updates a distance and calls + * @c decrease(k), the heap re-orders k against the current distance values. + * + * Position bookkeeping (key → heap index) is delegated to a @c PositionMap. + * A heap of N entries always has exactly one position recorded per contained + * key; non-contained keys map to @c npos. Every write to @c heap_[i] funnels + * through @c place_() to keep the map in sync. + * + * Complexity (Arity = d): + * - push : O(log_d N) + * - pop : O(d · log_d N) + * - decrease : O(log_d N) + * - top : O(1) + * - contains : O(1) lookup in the position map + * + * d = 4 minimises the product (d / log d) on typical Dijkstra workloads; + * see Boost.Graph's `d_ary_heap_indirect` and references therein. + * + * Concept-style requirements on @c PositionMap: + * std::size_t pm.position(Key) const; // returns indexed_dary_heap::npos if not present + * void pm.set_position(Key, std::size_t); + * + * Two adapters are provided in @c heap_position_map.hpp: + * - @c vector_position_map for dense integral keys + * - @c assoc_position_map for sparse / hashable keys + */ + +#pragma once + +#include "heap_position_map.hpp" + +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// GRAPH_DETAIL_FORCE_INLINE +// +// Force-inline attribute applied to the heap's hot helpers: less_than_, +// place_, sift_up_, sift_down_. +// +// Background (Phase 4.3b, agents/indexed_dary_heap_results.md): +// VTune software-mode hotspots on MSVC /O2 showed sift_down_ as a distinct +// 31% call frame, with std::less::operator() appearing as three +// separate non-inlined copies and container_value_fn::operator() as a real +// call — together ~50% of CPU time on Grid_Idx4/100K. +// +// Investigation (Phases 4.3c–d): +// - Annotating only less_than_/place_ (result 004): no effect — the outer +// sift_down_ call frame still dominates; inner force-inline is local to +// the sift body and does not collapse the outer boundary. +// - Annotating sift_down_/sift_up_ as well (result 005): also no effect — +// MSVC silently ignores __forceinline on functions of this complexity +// regardless of the annotation when the call site is a large template +// instantiation. The /O2 /Ob2 inline budget is the real blocker. +// +// The annotations are kept as-is because: +// (a) They are a no-op on GCC/Clang (already inlined at -O2+). +// (b) They document intent and may become effective with /Ob3 or a +// future MSVC version. +// (c) The next investigative step is to try /Ob3 in the release preset +// (see agents/indexed_dary_heap_results.md Phase 4.3d next steps). +// --------------------------------------------------------------------------- +#if defined(_MSC_VER) +# define GRAPH_DETAIL_FORCE_INLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +# define GRAPH_DETAIL_FORCE_INLINE [[gnu::always_inline]] inline +#else +# define GRAPH_DETAIL_FORCE_INLINE inline +#endif + + +namespace graph::detail { + +// --------------------------------------------------------------------------- +// indexed_dary_heap +// --------------------------------------------------------------------------- + +// Arity is intentionally a compile-time template parameter, not a runtime value. +// The performance benefit of a d-ary heap over a binary heap comes from +// reducing tree height (fewer cache misses on decrease-key) while keeping the +// inner child-scan loop tight enough to fit in registers. That inner loop +// iterates over exactly Arity children in sift_down and is the hottest path +// during Dijkstra. A compile-time Arity allows the compiler to fully unroll +// it, elide the loop counter, and apply SIMD optimisations. A runtime arity +// would turn it into a variable-count loop and forfeit those gains. +// The default Arity=4 minimises the product (d / log2 d) on typical +// Dijkstra workloads; see Boost.Graph's d_ary_heap_indirect and the +// analysis in agents/indexed_dary_heap_plan.md § Open Questions. +template > +class indexed_dary_heap { + static_assert(Arity >= 2, "Arity must be at least 2"); + +public: + using key_type = Key; + using size_type = std::size_t; + using distance_fn = DistanceFn; + using compare_type = Compare; + using position_map = PositionMap; + using allocator_type = Allocator; + + static constexpr size_type arity = Arity; + static constexpr size_type npos = static_cast(-1); + + indexed_dary_heap(DistanceFn dist, Compare comp, PositionMap pmap, + const Allocator& alloc = Allocator()) + : heap_(alloc), distance_(std::move(dist)), compare_(std::move(comp)), + position_(std::move(pmap)) {} + + // ----- size / state ---------------------------------------------------- + + [[nodiscard]] bool empty() const noexcept { return heap_.empty(); } + [[nodiscard]] size_type size() const noexcept { return heap_.size(); } + + void reserve(size_type n) { heap_.reserve(n); } + + /// Remove all entries. Resets each contained key's position to npos. + void clear() noexcept { + for (const auto& k : heap_) { + position_.set_position(k, npos); + } + heap_.clear(); + } + + // ----- queries --------------------------------------------------------- + + /// O(1). Returns the key with the smallest distance under @c Compare. + /// Precondition: !empty(). + [[nodiscard]] const Key& top() const noexcept { return heap_.front(); } + + /// O(1). True iff @c k is currently in the heap. + [[nodiscard]] bool contains(const Key& k) const noexcept { + return position_.position(k) != npos; + } + + // ----- modifiers ------------------------------------------------------- + + /// O(log_d N). Insert @c k. Behaviour is undefined if @c k is already + /// present — callers should use @c decrease() for re-insertions. + void push(const Key& k) { + const size_type i = heap_.size(); + heap_.push_back(k); + position_.set_position(k, i); + sift_up_(i); + } + + /// O(d · log_d N). Remove the top element. + /// Precondition: !empty(). + void pop() { + const Key removed = heap_.front(); + position_.set_position(removed, npos); + + const size_type last = heap_.size() - 1; + if (last == 0) { + heap_.pop_back(); + return; + } + // Move last → root, then sift down. + place_(0, heap_[last]); + heap_.pop_back(); + sift_down_(0); + } + + /// O(log_d N). Notify the heap that @c k's distance has decreased + /// (under @c Compare). Sifts @c k up only. + /// Precondition: contains(k). + void decrease(const Key& k) { + const size_type i = position_.position(k); + sift_up_(i); + } + + /// Equivalent to @c push(k) if !contains(k), else @c decrease(k). + /// Convenience wrapper for the common Dijkstra relax pattern. + void push_or_decrease(const Key& k) { + const size_type i = position_.position(k); + if (i == npos) { + push(k); + } else { + sift_up_(i); + } + } + + // ----- accessors (mostly for testing / introspection) ------------------ + + [[nodiscard]] const PositionMap& position_map_ref() const noexcept { return position_; } + [[nodiscard]] PositionMap& position_map_ref() noexcept { return position_; } + +private: + // ----------------------------------------------------------------------- + // Heap topology helpers + // ----------------------------------------------------------------------- + + static constexpr size_type parent_of_(size_type i) noexcept { + return (i - 1) / Arity; + } + static constexpr size_type first_child_of_(size_type i) noexcept { + return Arity * i + 1; + } + + /// Place @c k at index @c i and update the position map. Single point of + /// truth for `heap_[i] = k` — guarantees position_ stays consistent. + GRAPH_DETAIL_FORCE_INLINE + void place_(size_type i, const Key& k) { + heap_[i] = k; + position_.set_position(k, i); + } + + /// Strict-less wrapper using the user's Compare on distances. Every sift + /// loop calls through here so that one inlining decision (this function) + /// is enough to collapse the entire comparator chain to a single compare + /// instruction — see GRAPH_DETAIL_FORCE_INLINE rationale above. + [[nodiscard]] GRAPH_DETAIL_FORCE_INLINE + bool less_than_(const Key& a, const Key& b) const { + return compare_(distance_(a), distance_(b)); + } + + // ----------------------------------------------------------------------- + // Sift operations + // + // Implemented "hole-style": instead of swap-walking, we pull the original + // value out, walk the hole, then drop the value into its final slot. Saves + // one write per level vs. a naive swap loop. + // ----------------------------------------------------------------------- + + void sift_up_(size_type i) { + if (i == 0) return; + const Key k = heap_[i]; + while (i > 0) { + const size_type p = parent_of_(i); + if (!less_than_(k, heap_[p])) { + break; + } + place_(i, heap_[p]); // move parent down into the hole + i = p; + } + place_(i, k); + } + + void sift_down_(size_type i) { + const size_type n = heap_.size(); + if (n == 0) return; + const Key k = heap_[i]; + + while (true) { + const size_type first = first_child_of_(i); + if (first >= n) break; + + // Find the smallest child in [first, first + Arity). + const size_type last = (first + Arity < n) ? first + Arity : n; + size_type best = first; + for (size_type c = first + 1; c < last; ++c) { + if (less_than_(heap_[c], heap_[best])) { + best = c; + } + } + + if (!less_than_(heap_[best], k)) { + break; // k is no greater than its smallest child → done + } + place_(i, heap_[best]); // promote the smallest child into the hole + i = best; + } + place_(i, k); + } + + std::vector heap_; + DistanceFn distance_; + Compare compare_; + PositionMap position_; +}; + +} // namespace graph::detail diff --git a/scripts/perf/README.md b/scripts/perf/README.md new file mode 100644 index 0000000..6d1f0ff --- /dev/null +++ b/scripts/perf/README.md @@ -0,0 +1,91 @@ +# scripts/perf - performance investigation tooling + +Helpers for running benchmarks, parsing VTune output, indexing exe symbols, +and pulling targeted disassembly across both MSVC (`dumpbin`) and GCC +(`objdump`). Built to support `agents/csr_edge_value_perf_plan.md` and +`agents/thread_b_linux_runbook.md`. + +All scripts are stdlib-only (Python 3.10+). + +## Files + +| Script | OS | Purpose | +|---|---|---| +| `bench_run.py` | both | Run a benchmark filter with core-pinning + High priority; emit median rows as JSON. | +| `bench_compare.py` | both | Diff two `bench_run.py` JSONs as a markdown delta table. | +| `vtune_top.py` | both | Parse a VTune CSV hotspots report; emit a normalized top-N. | +| `sym_index.py` | win | Disk-cached `dumpbin /disasm:nobytes` parser. ~30s cold, ~0.5s warm. | +| `find_func.py` | win | Symbol search wrapper around `sym_index`; supports `--regex`. | +| `disasm_func.py` | win | Single-function disasm via `dumpbin /range:`. | +| `capture_asm.py` | win | Bulk-dump a manifest of functions in one cache-warm pass. | +| `objdump_capture.py` | linux | Linux/GCC counterpart of `capture_asm.py` using `nm` + `objdump`. | +| `linux_gcc_capture.sh`| linux | One-shot runbook driver: bench + perf-stat + objdump. | + +## Avoiding cmd-redirection of `<` and `>` + +Use `--regex` instead of `--pattern` for any filter that needs angle +brackets. Even better, replace them with `.` wildcards so the arg is plain +text (e.g. `use_indexed_dary_heap.4.` instead of `use_indexed_dary_heap<4>`). +Both `disasm_func.py`, `find_func.py`, and the manifests in +`agents/perf_capture_manifest*.txt` follow this convention. + +## Cache files + +`sym_index.py` writes `.symidx.json` next to the exe. Cache is +invalidated automatically when the exe's size or mtime changes. +`objdump_capture.py` does the same on Linux. + +## Workflow examples + +### One-shot bulk capture (used to populate `artifacts/perf/msvc_profile/`) + +```pwsh +# From a vcvars64 shell. +python scripts/perf/capture_asm.py ` + --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe ` + --manifest agents/perf_capture_manifest.txt ` + --out-dir artifacts/perf/msvc_profile +``` + +### Hotspot table from a VTune collection + +```pwsh +& "C:\Program Files (x86)\Intel\oneAPI\vtune\latest\bin64\vtune.exe" ` + -collect hotspots -knob sampling-mode=sw ` + -result-dir vtune/hot_001 -- ` + build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe ` + --benchmark_filter="BM_Dijkstra_CSR_Grid_Idx4/100000" --benchmark_min_time=15s + +& "C:\Program Files (x86)\Intel\oneAPI\vtune\latest\bin64\vtune.exe" ` + -report hotspots -r vtune/hot_001 -format csv > artifacts/perf/hot_001.csv + +python scripts/perf/vtune_top.py --csv artifacts/perf/hot_001.csv --top 15 --markdown +``` + +### Bench A vs bench B + +```pwsh +python scripts/perf/bench_run.py --exe ... --filter ... ` + --label baseline --out artifacts/perf/baseline.json +# (apply change, rebuild) +python scripts/perf/bench_run.py --exe ... --filter ... ` + --label candidate --out artifacts/perf/candidate.json +python scripts/perf/bench_compare.py ` + --baseline artifacts/perf/baseline.json ` + --candidate artifacts/perf/candidate.json ` + --threshold 5 +``` + +### Linux/WSL counterpart capture + +See `agents/thread_b_linux_runbook.md`. The one-liner is: + +```bash +bash scripts/perf/linux_gcc_capture.sh +``` + +## Output convention + +All scripts write either to stdout or to `--out `. The convention used +in `agents/`-side docs is `artifacts/perf//...`. +`artifacts/` is gitignored — these are working captures, not source-of-truth. diff --git a/scripts/perf/__init__.py b/scripts/perf/__init__.py new file mode 100644 index 0000000..7e17005 --- /dev/null +++ b/scripts/perf/__init__.py @@ -0,0 +1,6 @@ +# Performance investigation tooling. +# +# See scripts/perf/README.md for usage. These scripts orchestrate Google +# Benchmark runs, parse VTune CSV exports, and target dumpbin output to +# support the work documented in agents/csr_edge_value_perf_plan.md and +# agents/indexed_dary_heap_results.md. diff --git a/scripts/perf/bench_compare.py b/scripts/perf/bench_compare.py new file mode 100644 index 0000000..93ff33e --- /dev/null +++ b/scripts/perf/bench_compare.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +""" +bench_compare.py — diff two bench_run.py JSON outputs as a markdown table. + +Joins on (benchmark name, aggregate). Default aggregate is `median`. +Emits a markdown table with absolute times and Δ%, plus regression / win flags. + +Example: + python scripts/perf/bench_compare.py \ + --baseline artifacts/grid_ob2.json \ + --candidate artifacts/grid_ob3.json \ + --threshold 5 +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Optional + + +def _load(path: Path) -> dict: + return json.loads(path.read_text()) + + +def _index_rows(payload: dict, agg: str) -> dict[str, float]: + return { + r["name"]: r["real_time_ns"] + for r in payload["rows"] + if r["aggregate"] == agg + } + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--baseline", type=Path, required=True) + ap.add_argument("--candidate", type=Path, required=True) + ap.add_argument("--aggregate", default="median", + choices=["mean", "median", "stddev", "cv"], + help="Aggregate to compare (default median).") + ap.add_argument("--threshold", type=float, default=5.0, + help="Δ%% above which to flag regression (⚠) or win (✅).") + ap.add_argument("--label-baseline", default=None, + help="Override column header for baseline (default: from JSON).") + ap.add_argument("--label-candidate", default=None, + help="Override column header for candidate.") + ap.add_argument("--out", type=Path, help="Write markdown to file (default stdout).") + args = ap.parse_args() + + base = _load(args.baseline) + cand = _load(args.candidate) + base_rows = _index_rows(base, args.aggregate) + cand_rows = _index_rows(cand, args.aggregate) + + label_base = args.label_baseline or base.get("label") or args.baseline.stem + label_cand = args.label_candidate or cand.get("label") or args.candidate.stem + + # union of keys, sorted + keys = sorted(set(base_rows) | set(cand_rows)) + if not keys: + raise SystemExit(f"no rows with aggregate={args.aggregate!r} in either file") + + lines = [ + f"| Benchmark | {label_base} (ns) | {label_cand} (ns) | Δ % |", + "|---|---:|---:|---:|", + ] + for k in keys: + b = base_rows.get(k) + c = cand_rows.get(k) + if b is None: + lines.append(f"| {k} | — | {c:,.0f} | new |") + continue + if c is None: + lines.append(f"| {k} | {b:,.0f} | — | dropped |") + continue + delta = (c - b) / b * 100.0 + flag = "" + if delta >= args.threshold: + flag = " ⚠" + elif delta <= -args.threshold: + flag = " ✅" + lines.append(f"| {k} | {b:,.0f} | {c:,.0f} | {delta:+.1f} %{flag} |") + + text = "\n".join(lines) + "\n" + if args.out: + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(text) + print(f"wrote table to {args.out}") + else: + print(text) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/perf/bench_run.py b/scripts/perf/bench_run.py new file mode 100644 index 0000000..21a94ae --- /dev/null +++ b/scripts/perf/bench_run.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +bench_run.py — run a benchmark filter, capture median rows as JSON. + +Wraps the manual core-pin / priority-High / 5-rep / median pattern used +throughout Phase 4.x perf work, and emits a structured result instead of +PowerShell `Select-String "median"` plumbing. + +Example: + python scripts/perf/bench_run.py \ + --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \ + --filter "BM_Dijkstra_(CSR|BGL_CSR)_Grid(_Idx4)?/(10000|100000)$" \ + --reps 5 --min-time 2s \ + --out artifacts/bench_grid_msvc_profile.json +""" + +from __future__ import annotations + +import argparse +import ctypes +import json +import re +import subprocess +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Optional + + +# Windows process-priority constants +PRIORITY_HIGH = 0x00000080 + + +@dataclass +class BenchRow: + name: str # benchmark name, e.g. BM_Dijkstra_CSR_Grid_Idx4/100000 + aggregate: str # mean / median / stddev / cv + real_time_ns: float + cpu_time_ns: float + iterations: int + + +# Lines look like: +# BM_Dijkstra_CSR_Grid_Idx4/100000_median 7440434 ns 7280759 ns 5 +# stddev rows have ns; cv rows have %. +_ROW_RE = re.compile( + r"^(?PBM_\S+?)_(?Pmean|median|stddev|cv)\s+" + r"(?P\S+)\s+(?:ns|%)\s+(?P\S+)\s+(?:ns|%)\s+(?P\d+)\s*$" +) + + +def parse_rows(text: str) -> list[BenchRow]: + rows: list[BenchRow] = [] + for line in text.splitlines(): + m = _ROW_RE.match(line) + if not m: + continue + try: + rows.append( + BenchRow( + name=m.group("name"), + aggregate=m.group("agg"), + real_time_ns=float(m.group("rt")), + cpu_time_ns=float(m.group("cpu")), + iterations=int(m.group("iter")), + ) + ) + except ValueError: + # silently skip malformed + pass + return rows + + +def _set_affinity_and_priority(pid: int, affinity_mask: int) -> None: + """Pin process to cores in `affinity_mask` and set HIGH priority. Windows-only.""" + if sys.platform != "win32": + return + PROCESS_ALL_ACCESS = 0x1F0FFF + h = ctypes.windll.kernel32.OpenProcess(PROCESS_ALL_ACCESS, False, pid) + if not h: + print(f"warning: OpenProcess({pid}) failed", file=sys.stderr) + return + try: + if not ctypes.windll.kernel32.SetProcessAffinityMask(h, affinity_mask): + print(f"warning: SetProcessAffinityMask failed", file=sys.stderr) + if not ctypes.windll.kernel32.SetPriorityClass(h, PRIORITY_HIGH): + print(f"warning: SetPriorityClass failed", file=sys.stderr) + finally: + ctypes.windll.kernel32.CloseHandle(h) + + +def run_benchmark( + exe: Path, + bench_filter: str, + reps: int, + min_time: str, + affinity_mask: int = 0x1, + aggregates_only: bool = True, + extra_args: Optional[list[str]] = None, +) -> tuple[str, list[BenchRow]]: + args = [ + str(exe), + f"--benchmark_filter={bench_filter}", + f"--benchmark_min_time={min_time}", + f"--benchmark_repetitions={reps}", + ] + if aggregates_only: + args.append("--benchmark_report_aggregates_only=true") + if extra_args: + args.extend(extra_args) + + # Start suspended so we can pin before the first iteration. Easiest cross-version + # approach: start normally, immediately pin, then wait. The first ~ms of run loses + # the pin, but Google Benchmark's per-rep median and our 5-rep aggregate absorb it. + proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + _set_affinity_and_priority(proc.pid, affinity_mask) + out, _ = proc.communicate() + if proc.returncode != 0: + print(out, file=sys.stderr) + raise SystemExit(f"benchmark exited with {proc.returncode}") + return out, parse_rows(out) + + +def main() -> int: + ap = argparse.ArgumentParser(description="Run a benchmark filter and capture rows as JSON.") + ap.add_argument("--exe", type=Path, required=True, help="Path to benchmark executable.") + ap.add_argument("--filter", required=True, help="--benchmark_filter regex.") + ap.add_argument("--reps", type=int, default=5, help="--benchmark_repetitions (default 5).") + ap.add_argument("--min-time", default="2s", help="--benchmark_min_time (default 2s).") + ap.add_argument("--affinity", type=lambda s: int(s, 0), default=0x1, + help="Process affinity mask (default 0x1 = core 0).") + ap.add_argument("--out", type=Path, help="Write JSON to this path (default: stdout).") + ap.add_argument("--label", default="", help="Free-form label stored in the JSON output.") + ap.add_argument("--print-stdout", action="store_true", + help="Also print the raw benchmark stdout to this process's stderr.") + args = ap.parse_args() + + if not args.exe.exists(): + raise SystemExit(f"executable not found: {args.exe}") + + raw, rows = run_benchmark( + args.exe, + args.filter, + args.reps, + args.min_time, + affinity_mask=args.affinity, + ) + if args.print_stdout: + print(raw, file=sys.stderr) + + payload = { + "label": args.label, + "exe": str(args.exe), + "filter": args.filter, + "reps": args.reps, + "min_time": args.min_time, + "affinity_mask": hex(args.affinity), + "rows": [asdict(r) for r in rows], + } + text = json.dumps(payload, indent=2) + if args.out: + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(text) + print(f"wrote {len(rows)} rows to {args.out}") + else: + print(text) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/perf/capture_asm.py b/scripts/perf/capture_asm.py new file mode 100644 index 0000000..581ef32 --- /dev/null +++ b/scripts/perf/capture_asm.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +"""capture_asm.py - bulk-dump a curated list of functions into one directory. + +Manifest format (one capture per non-blank, non-# line; whitespace-separated): + + [:N] [substring1 substring2 ...] + + - basename: output filename stem + - :N optional 0-based index (default 0) to disambiguate when the + regex+substrings still match more than one symbol + - length_hex: how many bytes to disassemble from the symbol's RVA + - regex: Python re matched against the demangled symbol name + (use this for patterns containing < or >) + - substrings: AND-filtered after the regex match +""" + +from __future__ import annotations + +import argparse +import shlex +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from sym_index import disasm_range, filter_symbols, index_functions # noqa: E402 + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--exe", type=Path, required=True) + ap.add_argument("--manifest", type=Path, required=True) + ap.add_argument("--out-dir", type=Path, required=True) + ap.add_argument("--rebuild-cache", action="store_true") + args = ap.parse_args() + + syms = index_functions(args.exe, force_rebuild=args.rebuild_cache) + args.out_dir.mkdir(parents=True, exist_ok=True) + + n_ok = 0 + n_skip = 0 + for raw in args.manifest.read_text().splitlines(): + line = raw.strip() + if not line or line.startswith("#"): + continue + parts = shlex.split(line) + if len(parts) < 3: + print(f" skip (need basename, length, regex): {line}", file=sys.stderr) + n_skip += 1 + continue + + basename, length_str, first_regex, *rest = parts + + pick = 0 + if ":" in basename: + basename, pick_str = basename.rsplit(":", 1) + try: + pick = int(pick_str) + except ValueError: + print(f" skip (bad :N suffix on {basename!r}): {line}", file=sys.stderr) + n_skip += 1 + continue + + try: + length = int(length_str, 0) + except ValueError: + print(f" skip (bad length {length_str!r}): {line}", file=sys.stderr) + n_skip += 1 + continue + + matches = filter_symbols(syms, substrings=rest, regexes=[first_regex]) + if not matches: + print(f" no match: {basename} (regex={first_regex!r} subs={rest!r})", file=sys.stderr) + n_skip += 1 + continue + if pick >= len(matches): + print(f" skip ({basename}: pick={pick} but only {len(matches)} matches)", file=sys.stderr) + n_skip += 1 + continue + if len(matches) > 1 and pick == 0 and ":" not in raw.split()[0]: + short = matches[0].name if len(matches[0].name) <= 140 else matches[0].name[:140] + "..." + print(f" note: {basename}: {len(matches)} matches; using [0] ({short})", file=sys.stderr) + + sym = matches[pick] + asm = disasm_range(args.exe, sym.rva, sym.rva + length) + out_path = args.out_dir / f"{basename}.asm" + out_path.write_text(asm) + print(f" OK {basename:<32} 0x{sym.rva:x} pick={pick} {len(asm.splitlines())} lines") + n_ok += 1 + + print(f"\ncaptured {n_ok}, skipped {n_skip}", file=sys.stderr) + return 0 if n_skip == 0 else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/perf/disasm_func.py b/scripts/perf/disasm_func.py new file mode 100644 index 0000000..e2e4f1e --- /dev/null +++ b/scripts/perf/disasm_func.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +""" +disasm_func.py - disassemble one function from a Windows exe. + +Selects a function by --pattern (substring) and/or --regex (full Python re), +then dumps just that function's bytes via dumpbin /range. + +The symbol index is cached on disk by sym_index.py, so repeated invocations +on the same exe are near-instant after the first call. + +Avoid putting raw '<' or '>' on a Windows command line: cmd treats them as +redirection. Use --regex with escaped angle brackets instead, e.g.: + --regex 'use_indexed_dary_heap<4>' + +Example: + python scripts/perf/disasm_func.py \ + --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \ + --regex 'use_indexed_dary_heap<4>' --pattern sift_down_ \ + --out artifacts/perf/sift_down_idx4.asm +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from sym_index import ( # noqa: E402 + disasm_range, + filter_symbols, + index_functions, +) + + +def _truncate(name: str, n: int) -> str: + return name if len(name) <= n else name[:n] + "\u2026" + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--exe", type=Path, required=True) + ap.add_argument("--pattern", action="append", default=[], + help="substring filter (multiple = AND).") + ap.add_argument("--regex", action="append", default=[], + help="Python regex filter (multiple = AND). Use this for patterns with angle brackets.") + ap.add_argument("--list-only", action="store_true") + ap.add_argument("--length", type=lambda s: int(s, 0), default=0x1000) + ap.add_argument("--match-index", type=int, default=0) + ap.add_argument("--rebuild-cache", action="store_true") + ap.add_argument("--out", type=Path) + ap.add_argument("--no-truncate", action="store_true") + args = ap.parse_args() + + if not args.exe.exists(): + raise SystemExit(f"exe not found: {args.exe}") + if not args.pattern and not args.regex: + raise SystemExit("need at least one --pattern or --regex") + + syms = index_functions(args.exe, force_rebuild=args.rebuild_cache) + matches = filter_symbols(syms, args.pattern, args.regex) + if not matches: + sys.stderr.write(f"no symbols matched: patterns={args.pattern} regexes={args.regex}\n") + return 1 + + print(f"matches ({len(matches)} of {len(syms)} indexed):", file=sys.stderr) + for i, s in enumerate(matches[:30]): + name = s.name if args.no_truncate else _truncate(s.name, 200) + print(f" [{i}] 0x{s.rva:x} {name}", file=sys.stderr) + if len(matches) > 30: + print(f" ... +{len(matches) - 30} more", file=sys.stderr) + + if args.list_only: + return 0 + + if args.match_index >= len(matches): + raise SystemExit(f"--match-index {args.match_index} out of range (have {len(matches)})") + sym = matches[args.match_index] + asm = disasm_range(args.exe, sym.rva, sym.rva + args.length) + if args.out: + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(asm) + print(f"wrote {len(asm.splitlines())} lines to {args.out}", file=sys.stderr) + else: + print(asm) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/perf/find_func.py b/scripts/perf/find_func.py new file mode 100644 index 0000000..6846000 --- /dev/null +++ b/scripts/perf/find_func.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +"""find_func.py - search the cached symbol index by substring(s) and/or regex(es). + +Light wrapper for sym_index. Use --regex when the pattern needs angle +brackets (cmd treats < and > as redirection). + +Example: + python scripts/perf/find_func.py \ + --exe build/windows-msvc-profile/benchmark/algorithms/benchmark_dijkstra.exe \ + --pattern compressed_graph --pattern lambda_2 \ + --regex 'use_indexed_dary_heap<4>' +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from sym_index import filter_symbols, index_functions # noqa: E402 + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--exe", type=Path, required=True) + ap.add_argument("--pattern", action="append", default=[]) + ap.add_argument("--regex", action="append", default=[]) + ap.add_argument("--limit", type=int, default=20) + ap.add_argument("--no-truncate", action="store_true") + ap.add_argument("--rebuild-cache", action="store_true") + args = ap.parse_args() + + if not args.pattern and not args.regex: + raise SystemExit("need at least one --pattern or --regex") + + syms = index_functions(args.exe, force_rebuild=args.rebuild_cache) + matches = filter_symbols(syms, args.pattern, args.regex) + print(f"{len(matches)} match(es) of {len(syms)} indexed:") + for i, s in enumerate(matches[: args.limit]): + name = s.name if args.no_truncate else (s.name if len(s.name) <= 200 else s.name[:200] + "\u2026") + print(f" [{i}] 0x{s.rva:x} {name}") + if len(matches) > args.limit: + print(f" ... +{len(matches) - args.limit} more") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/perf/linux_gcc_capture.sh b/scripts/perf/linux_gcc_capture.sh new file mode 100644 index 0000000..7083516 --- /dev/null +++ b/scripts/perf/linux_gcc_capture.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# linux_gcc_capture.sh - capture the Linux/GCC counterpart of the MSVC reference +# in artifacts/perf/msvc_profile/. +# +# WSL has no hardware performance counters (PMU is not exposed), so this +# script intentionally avoids `perf stat -e cache-misses,...` and uses only: +# - wall-clock (Google Benchmark median across 5 reps) +# - software perf events that work in WSL +# - objdump for per-symbol disassembly comparison +# +# Run from the workspace root after cloning to a Linux/WSL machine: +# +# cmake --preset linux-gcc-release +# cmake --build --preset linux-gcc-release +# bash scripts/perf/linux_gcc_capture.sh +# +# Output lands in artifacts/perf/linux_gcc/, mirroring artifacts/perf/msvc_profile/. + +set -euo pipefail + +EXE="${1:-build/linux-gcc-release/benchmark/algorithms/benchmark_dijkstra}" +OUT_DIR="${2:-artifacts/perf/linux_gcc}" + +if [[ ! -x "$EXE" ]]; then + echo "ERROR: benchmark exe not found or not executable: $EXE" >&2 + echo "Usage: $0 [path/to/benchmark_dijkstra] [out_dir]" >&2 + exit 2 +fi + +mkdir -p "$OUT_DIR" +echo "==> capturing Linux/GCC reference into $OUT_DIR" + +# ---------- 1. wall-clock baseline ---------- +echo "--- 1. wall-clock baseline (5 reps, median, taskset core 4) ---" +taskset -c 4 python3 scripts/perf/bench_run.py \ + --exe "$EXE" \ + --filter 'BM_Dijkstra_(CSR|BGL_CSR)_(ER_Sparse|Grid|BA|Path)(_Idx4)?/(10000|100000)$' \ + --reps 5 --min-time 2s \ + --label "linux-gcc-release" \ + --out "$OUT_DIR/wallclock_baseline.json" + +# ---------- 2. software perf-stat counters (WSL-friendly) ---------- +# These software events do NOT need the PMU; they work in WSL. +SW_EVENTS="task-clock,context-switches,page-faults,cpu-migrations,instructions:u,cycles:u" +echo "--- 2. perf stat (software events, no PMU required) ---" +for bench in BM_Dijkstra_CSR_Grid_Idx4/100000 BM_Dijkstra_BGL_CSR_Grid/100000 \ + BM_Dijkstra_CSR_Path_Idx4/100000 BM_Dijkstra_BGL_CSR_Path/100000; do + safe="${bench//\//_}" + echo " perf stat $bench" + taskset -c 4 perf stat -e "$SW_EVENTS" -r 3 -- \ + "$EXE" --benchmark_filter="^${bench}$" --benchmark_min_time=3s \ + > "$OUT_DIR/perfstat_${safe}.stdout" \ + 2> "$OUT_DIR/perfstat_${safe}.stderr" || \ + echo " note: perf stat returned non-zero for $bench (may indicate no PMU)" >&2 +done + +# ---------- 3. objdump per-symbol captures ---------- +# GCC's objdump does the demangling MSVC's dumpbin does, but with --demangle. +echo "--- 3. objdump captures (mirrors MSVC manifest) ---" +python3 scripts/perf/objdump_capture.py \ + --exe "$EXE" \ + --manifest agents/perf_capture_manifest_linux.txt \ + --out-dir "$OUT_DIR" + +echo +echo "==> Linux capture complete. Diff against MSVC with:" +echo " python scripts/perf/bench_compare.py \\" +echo " --baseline artifacts/perf/msvc_profile/wallclock_baseline.json \\" +echo " --candidate $OUT_DIR/wallclock_baseline.json \\" +echo " --label-baseline msvc --label-candidate gcc" diff --git a/scripts/perf/objdump_capture.py b/scripts/perf/objdump_capture.py new file mode 100644 index 0000000..44d9b3e --- /dev/null +++ b/scripts/perf/objdump_capture.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +objdump_capture.py - Linux/GCC counterpart of capture_asm.py. + +Same manifest format as capture_asm.py, but uses `objdump` instead of +`dumpbin` and operates on demangled C++ symbol names from `nm --demangle`. + +Required tools (Linux/WSL): + - objdump (binutils) + - nm (binutils) + +Manifest format (one capture per non-blank, non-# line): + + [:N] [substring1 substring2 ...] + +The Linux manifest (agents/perf_capture_manifest_linux.txt) parallels the +MSVC one but accounts for: + - Itanium mangling vs MSVC mangling (e.g. `_Z...` vs `??...`) + - GCC's tendency to emit `.cold` partitions (fold them into the main body + by extending the --length) + - BGL's d_ary_heap_indirect inlines preserve_heap_property_down on GCC, + so its capture often returns `` only. +""" + +from __future__ import annotations + +import argparse +import json +import re +import shlex +import shutil +import subprocess +import sys +from dataclasses import asdict, dataclass +from pathlib import Path + + +@dataclass +class Symbol: + name: str + addr: int + size: int = 0 + + +def _which(tool: str) -> str: + p = shutil.which(tool) + if not p: + raise SystemExit(f"{tool} not on PATH (need binutils on Linux/WSL)") + return p + + +def _cache_path(exe: Path) -> Path: + return exe.with_suffix(exe.suffix + ".symidx.json") + + +def _exe_fingerprint(exe: Path) -> dict: + st = exe.stat() + return {"path": str(exe), "size": st.st_size, "mtime_ns": st.st_mtime_ns} + + +# nm output line format with --demangle and -S (sizes): +# 0000000000003fa0 0000000000000123 T graph::dijkstra_shortest_paths<...>(...) +_NM_RE = re.compile( + r"^(?P[0-9a-f]+)\s+(?P[0-9a-f]+)\s+(?P[A-Za-z])\s+(?P.+)$" +) + + +def index_symbols(exe: Path, *, force_rebuild: bool = False) -> list[Symbol]: + cache = _cache_path(exe) + if not force_rebuild and cache.exists(): + try: + payload = json.loads(cache.read_text()) + if payload.get("fingerprint") == _exe_fingerprint(exe): + return [Symbol(**s) for s in payload["symbols"]] + except (OSError, json.JSONDecodeError, KeyError): + pass + + nm = _which("nm") + print(f"indexing functions in {exe.name} via nm --demangle ...", file=sys.stderr) + proc = subprocess.run( + [nm, "--demangle", "--print-size", "--defined-only", + "--no-sort", str(exe)], + capture_output=True, text=True, errors="replace", + ) + if proc.returncode != 0: + sys.stderr.write(proc.stderr) + raise SystemExit(f"nm failed ({proc.returncode})") + + syms: list[Symbol] = [] + for line in proc.stdout.splitlines(): + m = _NM_RE.match(line) + if not m: + continue + if m.group("type").lower() not in ("t", "w"): # text or weak text + continue + try: + syms.append(Symbol( + name=m.group("name"), + addr=int(m.group("addr"), 16), + size=int(m.group("size"), 16), + )) + except ValueError: + pass + + print(f" indexed {len(syms)} text symbols", file=sys.stderr) + try: + cache.write_text(json.dumps({ + "fingerprint": _exe_fingerprint(exe), + "symbols": [asdict(s) for s in syms], + })) + except OSError as e: + print(f" warning: failed to write cache: {e}", file=sys.stderr) + return syms + + +def filter_symbols( + syms, + substrings=(), + regexes=(), +): + sub = list(substrings) + rxs = [re.compile(r) for r in regexes] + out = [] + for s in syms: + if all(p in s.name for p in sub) and all(rx.search(s.name) for rx in rxs): + out.append(s) + return out + + +def disasm_range(exe: Path, start: int, end: int) -> str: + objdump = _which("objdump") + proc = subprocess.run( + [objdump, "-d", "--demangle", "--no-show-raw-insn", + f"--start-address=0x{start:x}", f"--stop-address=0x{end:x}", + str(exe)], + capture_output=True, text=True, errors="replace", + ) + if proc.returncode != 0: + sys.stderr.write(proc.stderr) + raise SystemExit(f"objdump failed ({proc.returncode})") + return proc.stdout + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--exe", type=Path, required=True) + ap.add_argument("--manifest", type=Path, required=True) + ap.add_argument("--out-dir", type=Path, required=True) + ap.add_argument("--rebuild-cache", action="store_true") + args = ap.parse_args() + + syms = index_symbols(args.exe, force_rebuild=args.rebuild_cache) + args.out_dir.mkdir(parents=True, exist_ok=True) + + n_ok = n_skip = 0 + for raw in args.manifest.read_text().splitlines(): + line = raw.strip() + if not line or line.startswith("#"): + continue + parts = shlex.split(line) + if len(parts) < 3: + print(f" skip (need basename, length, regex): {line}", file=sys.stderr) + n_skip += 1 + continue + + basename, length_str, first_regex, *rest = parts + pick = 0 + if ":" in basename: + basename, pick_str = basename.rsplit(":", 1) + try: + pick = int(pick_str) + except ValueError: + print(f" skip (bad :N suffix on {basename!r})", file=sys.stderr) + n_skip += 1 + continue + try: + length = int(length_str, 0) + except ValueError: + print(f" skip (bad length {length_str!r})", file=sys.stderr) + n_skip += 1 + continue + + matches = filter_symbols(syms, substrings=rest, regexes=[first_regex]) + if not matches: + print(f" no match: {basename} (regex={first_regex!r} subs={rest!r})", file=sys.stderr) + n_skip += 1 + continue + if pick >= len(matches): + print(f" skip ({basename}: pick={pick} but only {len(matches)} matches)", file=sys.stderr) + n_skip += 1 + continue + + sym = matches[pick] + # Use nm-reported size when available; else fall back to manifest length. + end = sym.addr + (sym.size if sym.size else length) + asm = disasm_range(args.exe, sym.addr, end) + out_path = args.out_dir / f"{basename}.asm" + out_path.write_text(asm) + print(f" OK {basename:<32} 0x{sym.addr:x} pick={pick} " + f"size={sym.size} {len(asm.splitlines())} lines") + n_ok += 1 + + print(f"\ncaptured {n_ok}, skipped {n_skip}", file=sys.stderr) + return 0 if n_skip == 0 else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/perf/sym_index.py b/scripts/perf/sym_index.py new file mode 100644 index 0000000..cc37956 --- /dev/null +++ b/scripts/perf/sym_index.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +""" +sym_index.py — disk-cached symbol index for a Windows exe. + +`dumpbin /disasm:nobytes` on a 1.4 MB benchmark takes ~30 s and returns +14k+ function entries. We need to query that table dozens of times during +a perf investigation; caching the parse result to a JSON file next to +the exe drops repeated lookups to <100 ms. + +Cache invalidation: by exe mtime+size in the cache header. +""" + +from __future__ import annotations + +import json +import re +import shutil +import subprocess +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Iterable + + +@dataclass +class Symbol: + name: str + rva: int # virtual address relative to image base + + +_FUNC_HEADER_RE = re.compile(r"^(?P[^\s].*?):\s*$") +_ADDR_RE = re.compile(r"^\s*(?P[0-9A-Fa-f]{8,16}):\s") + + +def _find_dumpbin() -> Path: + p = shutil.which("dumpbin") + if not p: + raise SystemExit( + "dumpbin not on PATH \u2014 run from a vcvars64 shell." + ) + return Path(p) + + +def _parse_dumpbin_output(text: str) -> list[Symbol]: + syms: list[Symbol] = [] + pending: str | None = None + for line in text.splitlines(): + if not line: + pending = None + continue + if pending is None: + if _ADDR_RE.match(line): + continue + m = _FUNC_HEADER_RE.match(line) + if not m: + continue + cand = m.group("name").strip() + if cand.startswith("Dump of"): + continue + head = cand.split()[0] if cand else "" + if "0x" in head: + continue + pending = cand + continue + m = _ADDR_RE.match(line) + if m: + try: + syms.append(Symbol(name=pending, rva=int(m.group("addr"), 16))) + except ValueError: + pass + pending = None + return syms + + +def _cache_path(exe: Path) -> Path: + return exe.with_suffix(exe.suffix + ".symidx.json") + + +def _exe_fingerprint(exe: Path) -> dict: + st = exe.stat() + return {"path": str(exe), "size": st.st_size, "mtime_ns": st.st_mtime_ns} + + +def _read_cache(exe: Path) -> list[Symbol] | None: + cache = _cache_path(exe) + if not cache.exists(): + return None + try: + payload = json.loads(cache.read_text()) + except (OSError, json.JSONDecodeError): + return None + if payload.get("fingerprint") != _exe_fingerprint(exe): + return None + return [Symbol(**s) for s in payload.get("symbols", [])] + + +def _write_cache(exe: Path, syms: list[Symbol]) -> None: + cache = _cache_path(exe) + payload = { + "fingerprint": _exe_fingerprint(exe), + "symbols": [asdict(s) for s in syms], + } + try: + cache.write_text(json.dumps(payload)) + except OSError as e: + print(f"warning: failed to write {cache}: {e}", file=sys.stderr) + + +def index_functions(exe: Path, *, force_rebuild: bool = False) -> list[Symbol]: + """Return the function-entry list for `exe`, caching the result on disk.""" + if not force_rebuild: + cached = _read_cache(exe) + if cached is not None: + return cached + + dumpbin = _find_dumpbin() + print(f"indexing functions in {exe.name} (one-time, ~30s) ...", file=sys.stderr) + proc = subprocess.run( + [str(dumpbin), "/disasm:nobytes", "/nologo", str(exe)], + capture_output=True, text=True, errors="replace", + ) + if proc.returncode != 0: + sys.stderr.write(proc.stderr) + raise SystemExit(f"dumpbin failed ({proc.returncode})") + + syms = _parse_dumpbin_output(proc.stdout) + print(f" indexed {len(syms)} functions; cached to {_cache_path(exe).name}", file=sys.stderr) + _write_cache(exe, syms) + return syms + + +def filter_symbols( + syms: Iterable[Symbol], + substrings: Iterable[str] = (), + regexes: Iterable[str] = (), + *, + include_ilt_thunks: bool = False, +) -> list[Symbol]: + """Return symbols matching ALL substrings AND ALL regexes. + + @ILT+... entries (incremental linker thunks - small forwarders, not real + bodies) are skipped by default; pass include_ilt_thunks=True to keep them. + """ + sub = list(substrings) + rxs = [re.compile(r) for r in regexes] + out: list[Symbol] = [] + for s in syms: + if not include_ilt_thunks and s.name.startswith("@ILT"): + continue + if all(p in s.name for p in sub) and all(rx.search(s.name) for rx in rxs): + out.append(s) + return out + + +def disasm_range(exe: Path, start: int, end: int) -> str: + dumpbin = _find_dumpbin() + proc = subprocess.run( + [str(dumpbin), "/disasm", "/nologo", + f"/range:0x{start:x},0x{end:x}", str(exe)], + capture_output=True, text=True, errors="replace", + ) + if proc.returncode != 0: + sys.stderr.write(proc.stderr) + raise SystemExit(f"dumpbin /range failed ({proc.returncode})") + return proc.stdout diff --git a/scripts/perf/vtune_top.py b/scripts/perf/vtune_top.py new file mode 100644 index 0000000..924c41f --- /dev/null +++ b/scripts/perf/vtune_top.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +vtune_top.py — read a VTune `-format csv` hotspots report, emit clean top-N. + +Replaces the brittle PowerShell parsing used in earlier Phase 4.x runs: + $vtune -report hotspots -r -format csv | + +Symbol normalization rules: + graph::detail::indexed_dary_heap<...>::F → heap::F + graph::container_value_fn<...>::F → cfn::F + graph::detail::vector_position_map::F → pm::F + std::less<...>::F → less::F + std::vector::F → vector::F (drops the Alloc) + std::_Vector_iterator<...>::F → _Vector_iter::F + graph::views::incidence_view<...>::F → incidence_view::F + +Example: + vtune.exe -report hotspots -r vtune/hot_001 -format csv > hot.csv + python scripts/perf/vtune_top.py --csv hot.csv --top 15 +""" + +from __future__ import annotations + +import argparse +import csv +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass +class Hotspot: + function: str + cpu_time: float + module: str = "" + source: str = "" + + +def _strip_template(name: str) -> str: + """Remove balanced angle-brackets from `name`.""" + out: list[str] = [] + depth = 0 + for ch in name: + if ch == "<": + depth += 1 + continue + if ch == ">": + depth = max(0, depth - 1) + continue + if depth == 0: + out.append(ch) + return "".join(out) + + +_NORMALIZERS: list[tuple[re.Pattern[str], str]] = [ + (re.compile(r"graph::detail::indexed_dary_heap<.*?>::"), "heap::"), + (re.compile(r"graph::container_value_fn<.*?>::"), "cfn::"), + (re.compile(r"graph::detail::vector_position_map::"), "pm::"), + (re.compile(r"std::less<.*?>::"), "less::"), + (re.compile(r"std::_Vector_iterator<.*?>::"), "_Vector_iter::"), + (re.compile(r"graph::views::incidence_view<.*?>::"), "incidence_view::"), + (re.compile(r"std::vector<([^,>]+),.*?>::"), r"vector<\1>::"), +] + + +def normalize(name: str) -> str: + s = name + for rx, repl in _NORMALIZERS: + s = rx.sub(repl, s) + # Trim crazy-long template instantiations on the bare end ("X<...>") that no rule matched. + if "<" in s and len(s) > 120: + s = _strip_template(s) + "<...>" + return s.strip() + + +def load_csv(path: Path) -> list[Hotspot]: + spots: list[Hotspot] = [] + # VTune CSV is tab-delimited despite the name. + with path.open(newline="", encoding="utf-8", errors="replace") as f: + reader = csv.reader(f, delimiter="\t") + header: Optional[list[str]] = None + for row in reader: + if not row or row[0].lower().startswith(("function", "vtune")): + if row and row[0].lower() == "function": + header = row + continue + if len(row) < 2: + continue + try: + cpu = float(row[1]) + except ValueError: + continue + module = row[5] if len(row) > 5 else "" + source = row[7] if len(row) > 7 else "" + spots.append(Hotspot(function=row[0], cpu_time=cpu, module=module, source=source)) + return spots + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--csv", type=Path, required=True, help="VTune hotspots CSV (tab-separated).") + ap.add_argument("--top", type=int, default=15, help="Rows to show (default 15).") + ap.add_argument("--no-normalize", action="store_true", + help="Skip the heap::/cfn::/etc. normalization rules.") + ap.add_argument("--markdown", action="store_true", + help="Emit a markdown table instead of plain text.") + args = ap.parse_args() + + spots = load_csv(args.csv) + if not spots: + print(f"no rows parsed from {args.csv}", file=sys.stderr) + return 1 + + total = sum(s.cpu_time for s in spots) + spots.sort(key=lambda s: s.cpu_time, reverse=True) + top = spots[: args.top] + + if args.markdown: + print(f"Total CPU collected: **{total:.2f} s** across {len(spots)} symbols\n") + print("| Rank | Function | CPU (s) | % |") + print("|---:|---|---:|---:|") + for i, s in enumerate(top, 1): + name = s.function if args.no_normalize else normalize(s.function) + pct = s.cpu_time / total * 100 if total else 0 + print(f"| {i} | `{name}` | {s.cpu_time:.3f} | {pct:.1f} |") + else: + print(f"Total CPU: {total:.2f} s across {len(spots)} symbols") + for i, s in enumerate(top, 1): + name = s.function if args.no_normalize else normalize(s.function) + pct = s.cpu_time / total * 100 if total else 0 + short = (name[:75] + "…") if len(name) > 76 else name + print(f" {i:2d}. {pct:5.1f} % {s.cpu_time:7.3f}s {short}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/algorithms/CMakeLists.txt b/tests/algorithms/CMakeLists.txt index 657e2ed..5126358 100644 --- a/tests/algorithms/CMakeLists.txt +++ b/tests/algorithms/CMakeLists.txt @@ -18,6 +18,8 @@ add_executable(test_algorithms test_jaccard.cpp test_scc_bidirectional.cpp test_tarjan_scc.cpp + test_indexed_dary_heap.cpp + test_dijkstra_indexed_heap.cpp ) target_link_libraries(test_algorithms diff --git a/tests/algorithms/test_dijkstra_indexed_heap.cpp b/tests/algorithms/test_dijkstra_indexed_heap.cpp new file mode 100644 index 0000000..26c7742 --- /dev/null +++ b/tests/algorithms/test_dijkstra_indexed_heap.cpp @@ -0,0 +1,483 @@ +/** + * @file test_dijkstra_indexed_heap.cpp + * @brief Phase 2 tests for the indexed d-ary heap path of dijkstra_shortest_paths. + * + * These tests: + * 1. Re-run core Dijkstra scenarios with `use_indexed_dary_heap<>` and + * assert identical distances/predecessors as the default-heap path. + * 2. Audit visitor call counts: examine, finish, edge-relaxed, and + * edge-not-relaxed events must match between the two heap paths + * (Phase 2.3 visitor-semantics audit). + */ + +#include +#include +#include +#include "../common/graph_fixtures.hpp" +#include "../common/algorithm_test_types.hpp" +#include "../common/map_graph_fixtures.hpp" +#include +#include +#include + +#include +#include + +using namespace graph; +using namespace graph::adj_list; +using namespace graph::test; +using namespace graph::test::fixtures; +using namespace graph::test::algorithm; + +namespace { + +// Visitor that records exact call counts for every event Dijkstra fires. +struct CountingVisitor { + int initialize = 0; + int discover = 0; + int examine = 0; + int finish = 0; + int relaxed = 0; + int not_relaxed = 0; + + template void on_initialize_vertex(const G&, const V&) { ++initialize; } + template void on_discover_vertex (const G&, const V&) { ++discover; } + template void on_examine_vertex (const G&, const V&) { ++examine; } + template void on_finish_vertex (const G&, const V&) { ++finish; } + template void on_edge_relaxed (const G&, const E&) { ++relaxed; } + template void on_edge_not_relaxed(const G&, const E&) { ++not_relaxed; } +}; + +} // namespace + +// --------------------------------------------------------------------------- +// Correctness: indexed heap produces the same distances as the default heap +// --------------------------------------------------------------------------- + +TEST_CASE("dijkstra(indexed_heap) - CLRS example matches default heap", + "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + + auto g = clrs_dijkstra_graph(); + std::vector distance(num_vertices(g)); + std::vector> predecessor(num_vertices(g)); + init_shortest_paths(g, distance, predecessor); + + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(distance), + container_value_fn(predecessor), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + empty_visitor{}, + std::less{}, + std::plus{}, + use_indexed_dary_heap<4>{}, + std::allocator{}); + + for (size_t i = 0; i < clrs_dijkstra_results::distances_from_0.size(); ++i) { + CHECK(distance[i] == clrs_dijkstra_results::distances_from_0[i]); + } +} + +TEST_CASE("dijkstra(indexed_heap) - path graph", "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + + auto g = path_graph_4_weighted(); + std::vector distance(num_vertices(g)); + std::vector> predecessor(num_vertices(g)); + init_shortest_paths(g, distance, predecessor); + + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(distance), + container_value_fn(predecessor), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + empty_visitor{}, + std::less{}, + std::plus{}, + use_indexed_dary_heap<>{}, + std::allocator{}); + + for (size_t i = 0; i < path_graph_4_results::num_vertices; ++i) { + CHECK(distance[i] == path_graph_4_results::distances[i]); + } +} + +TEST_CASE("dijkstra(indexed_heap) - multi-source CLRS", "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + + auto g = clrs_dijkstra_graph(); + std::vector distance(num_vertices(g)); + std::vector> predecessor(num_vertices(g)); + init_shortest_paths(g, distance, predecessor); + + std::vector> sources = {0, 3}; + + dijkstra_shortest_paths(g, sources, + container_value_fn(distance), + container_value_fn(predecessor), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + empty_visitor{}, + std::less{}, + std::plus{}, + use_indexed_dary_heap<>{}, + std::allocator{}); + + // Both sources start at distance 0; every vertex is reachable. + CHECK(distance[0] == 0); + CHECK(distance[3] == 0); + for (auto d : distance) CHECK(d != infinite_distance()); +} + +// Resolves Open Question 4: confirm that with multi-source seeding, the +// indexed-heap path fires on_examine_vertex / on_finish_vertex / on_discover +// the same number of times as the default-heap path. With non-negative +// weights, every source is pushed at distance 0 and finalized on its first +// pop (no later relax can lower distance below 0), so the visitor sees each +// vertex exactly once on both paths. +TEST_CASE("dijkstra(indexed_heap) - multi-source visitor parity vs default heap", + "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + auto g = clrs_dijkstra_graph(); + const auto N = num_vertices(g); + std::vector> sources = {0, 3}; + + auto run = [&](auto heap_tag) { + std::vector distance(N); + std::vector> predecessor(N); + init_shortest_paths(g, distance, predecessor); + CountingVisitor v{}; + dijkstra_shortest_paths(g, sources, + container_value_fn(distance), + container_value_fn(predecessor), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + v, + std::less{}, + std::plus{}, + heap_tag); + return std::tuple{v, distance}; + }; + + auto [v_def, d_def ] = run(use_default_heap{}); + auto [v_idx4, d_idx4] = run(use_indexed_dary_heap<4>{}); + auto [v_idx8, d_idx8] = run(use_indexed_dary_heap<8>{}); + + // Distances must agree (sanity). + CHECK(d_def == d_idx4); + CHECK(d_def == d_idx8); + + // Visitor parity. Each vertex examined / finished exactly once; each + // vertex (incl. each source) discovered exactly once. + CHECK(v_def.examine == static_cast(N)); + CHECK(v_def.finish == static_cast(N)); + CHECK(v_def.discover == static_cast(N)); + + CHECK(v_idx4.examine == v_def.examine); + CHECK(v_idx4.finish == v_def.finish); + CHECK(v_idx4.discover == v_def.discover); + CHECK(v_idx4.relaxed == v_def.relaxed); + CHECK(v_idx4.not_relaxed == v_def.not_relaxed); + + CHECK(v_idx8.examine == v_def.examine); + CHECK(v_idx8.finish == v_def.finish); + CHECK(v_idx8.discover == v_def.discover); + CHECK(v_idx8.relaxed == v_def.relaxed); + CHECK(v_idx8.not_relaxed == v_def.not_relaxed); +} + +TEST_CASE("dijkstra(indexed_heap) - distances-only overload", + "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + + auto g = clrs_dijkstra_graph(); + std::vector distance(num_vertices(g)); + init_shortest_paths(g, distance); + + dijkstra_shortest_distances(g, vertex_id_t(0), + container_value_fn(distance), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + empty_visitor{}, + std::less{}, + std::plus{}, + use_indexed_dary_heap<>{}, + std::allocator{}); + + for (size_t i = 0; i < clrs_dijkstra_results::distances_from_0.size(); ++i) { + CHECK(distance[i] == clrs_dijkstra_results::distances_from_0[i]); + } +} + +TEST_CASE("dijkstra(indexed_heap) - arity 2 and arity 8 produce same distances", + "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + + auto g = clrs_dijkstra_graph(); + + std::vector d2(num_vertices(g)), d8(num_vertices(g)); + std::vector> p2(num_vertices(g)), p8(num_vertices(g)); + init_shortest_paths(g, d2, p2); + init_shortest_paths(g, d8, p8); + + auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }; + + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(d2), container_value_fn(p2), + wt, empty_visitor{}, std::less{}, std::plus{}, + use_indexed_dary_heap<2>{}, std::allocator{}); + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(d8), container_value_fn(p8), + wt, empty_visitor{}, std::less{}, std::plus{}, + use_indexed_dary_heap<8>{}, std::allocator{}); + + CHECK(d2 == d8); +} + +// --------------------------------------------------------------------------- +// Visitor call-count parity (Phase 2.3) +// --------------------------------------------------------------------------- + +TEST_CASE("dijkstra(indexed_heap) - visitor call counts match default heap", + "[algorithm][dijkstra][indexed_heap][visitor]") { + using Graph = vov_weighted; + + auto g = clrs_dijkstra_graph(); + auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }; + + CountingVisitor v_default; + CountingVisitor v_indexed; + + std::vector d_def(num_vertices(g)), d_idx(num_vertices(g)); + std::vector> p_def(num_vertices(g)), p_idx(num_vertices(g)); + init_shortest_paths(g, d_def, p_def); + init_shortest_paths(g, d_idx, p_idx); + + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(d_def), container_value_fn(p_def), + wt, v_default, + std::less{}, std::plus{}, + use_default_heap{}, std::allocator{}); + + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(d_idx), container_value_fn(p_idx), + wt, v_indexed, + std::less{}, std::plus{}, + use_indexed_dary_heap<>{}, std::allocator{}); + + // Distances must agree. + CHECK(d_def == d_idx); + + // Visitor call counts must agree exactly. Per Dijkstra invariants, every + // reachable vertex is examined and finished once, every outgoing edge of + // an examined vertex is either relaxed or not-relaxed exactly once, and + // discover fires once per reachable vertex. + CHECK(v_default.initialize == v_indexed.initialize); + CHECK(v_default.discover == v_indexed.discover); + CHECK(v_default.examine == v_indexed.examine); + CHECK(v_default.finish == v_indexed.finish); + CHECK(v_default.relaxed == v_indexed.relaxed); + CHECK(v_default.not_relaxed == v_indexed.not_relaxed); + + // Also assert the absolute invariant counts (5 reachable vertices in CLRS). + CHECK(v_indexed.examine == 5); + CHECK(v_indexed.finish == 5); + CHECK(v_indexed.discover == 5); +} + +TEST_CASE("dijkstra(indexed_heap) - visitor parity on path graph", + "[algorithm][dijkstra][indexed_heap][visitor]") { + using Graph = vov_weighted; + + auto g = path_graph_4_weighted(); + auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }; + + CountingVisitor v_default, v_indexed; + + std::vector d_def(num_vertices(g)), d_idx(num_vertices(g)); + std::vector> p_def(num_vertices(g)), p_idx(num_vertices(g)); + init_shortest_paths(g, d_def, p_def); + init_shortest_paths(g, d_idx, p_idx); + + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(d_def), container_value_fn(p_def), + wt, v_default, + std::less{}, std::plus{}, + use_default_heap{}, std::allocator{}); + dijkstra_shortest_paths(g, vertex_id_t(0), + container_value_fn(d_idx), container_value_fn(p_idx), + wt, v_indexed, + std::less{}, std::plus{}, + use_indexed_dary_heap<>{}, std::allocator{}); + + CHECK(d_def == d_idx); + CHECK(v_default.discover == v_indexed.discover); + CHECK(v_default.examine == v_indexed.examine); + CHECK(v_default.finish == v_indexed.finish); + CHECK(v_default.relaxed == v_indexed.relaxed); + CHECK(v_default.not_relaxed == v_indexed.not_relaxed); +} + +// --------------------------------------------------------------------------- +// Source-out-of-range still throws on the indexed-heap path +// --------------------------------------------------------------------------- + +TEST_CASE("dijkstra(indexed_heap) - throws on out-of-range source", + "[algorithm][dijkstra][indexed_heap]") { + using Graph = vov_weighted; + + auto g = clrs_dijkstra_graph(); + std::vector distance(num_vertices(g)); + init_shortest_paths(g, distance); + + CHECK_THROWS_AS( + dijkstra_shortest_distances( + g, vertex_id_t(num_vertices(g) + 1), + container_value_fn(distance), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + empty_visitor{}, std::less{}, std::plus{}, + use_indexed_dary_heap<>{}, std::allocator{}), + std::out_of_range); +} + +// --------------------------------------------------------------------------- +// Phase 3 - mapped-container support (assoc_position_map) +// +// SPARSE_VERTEX_TYPES are the map / unordered_map vertex containers +// (mov, mod, mol, uov, uod, uol). They do not satisfy index_vertex_range, +// so the indexed-heap path must select assoc_position_map automatically. +// --------------------------------------------------------------------------- + +TEMPLATE_TEST_CASE("dijkstra(indexed_heap) - sparse CLRS matches default heap", + "[algorithm][dijkstra][indexed_heap][sparse]", + SPARSE_VERTEX_TYPES) { + using Graph = TestType; + using id_type = vertex_id_t; + using namespace graph::test::map_fixtures; + + static_assert(!adj_list::index_vertex_range, + "SPARSE_VERTEX_TYPES must be mapped containers"); + + const auto& exp = clrs_dijkstra_sparse_expected{}; + auto g = map_fixtures::clrs_dijkstra_graph(); + + auto d_def = make_vertex_property_map(g, infinite_distance()); + auto p_def = make_vertex_property_map(g, id_type{}); + auto d_idx = make_vertex_property_map(g, infinite_distance()); + auto p_idx = make_vertex_property_map(g, id_type{}); + for (auto&& [uid, u] : views::vertexlist(g)) { + p_def[uid] = uid; + p_idx[uid] = uid; + } + + auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }; + + dijkstra_shortest_paths(g, id_type(exp.s), + container_value_fn(d_def), container_value_fn(p_def), + wt, empty_visitor{}, std::less{}, std::plus{}, + use_default_heap{}, std::allocator{}); + + dijkstra_shortest_paths(g, id_type(exp.s), + container_value_fn(d_idx), container_value_fn(p_idx), + wt, empty_visitor{}, std::less{}, std::plus{}, + use_indexed_dary_heap<>{}, std::allocator{}); + + // Distances must agree with the textbook results and across heap paths. + for (size_t i = 0; i < exp.num_vertices; ++i) { + CHECK(d_idx[exp.vertex_ids[i]] == exp.distances[i]); + CHECK(d_idx[exp.vertex_ids[i]] == d_def[exp.vertex_ids[i]]); + } + CHECK(p_idx[exp.s] == exp.s); +} + +TEMPLATE_TEST_CASE("dijkstra(indexed_heap) - sparse visitor parity", + "[algorithm][dijkstra][indexed_heap][sparse][visitor]", + SPARSE_VERTEX_TYPES) { + using Graph = TestType; + using id_type = vertex_id_t; + using namespace graph::test::map_fixtures; + + const auto& exp = clrs_dijkstra_sparse_expected{}; + auto g = map_fixtures::clrs_dijkstra_graph(); + + auto d_def = make_vertex_property_map(g, infinite_distance()); + auto d_idx = make_vertex_property_map(g, infinite_distance()); + + CountingVisitor v_default, v_indexed; + auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }; + + dijkstra_shortest_distances(g, id_type(exp.s), container_value_fn(d_def), + wt, v_default, std::less{}, std::plus{}, + use_default_heap{}, std::allocator{}); + dijkstra_shortest_distances(g, id_type(exp.s), container_value_fn(d_idx), + wt, v_indexed, std::less{}, std::plus{}, + use_indexed_dary_heap<>{}, std::allocator{}); + + CHECK(v_default.discover == v_indexed.discover); + CHECK(v_default.examine == v_indexed.examine); + CHECK(v_default.finish == v_indexed.finish); + CHECK(v_default.relaxed == v_indexed.relaxed); + CHECK(v_default.not_relaxed == v_indexed.not_relaxed); + + // 5 reachable vertices in the CLRS graph. + CHECK(v_indexed.examine == 5); + CHECK(v_indexed.finish == 5); +} + +// --------------------------------------------------------------------------- +// Phase 3 - non-integral vertex IDs (std::string keys) +// +// Exercises the assoc_position_map path with a hashable, non-integral key +// type. SPARSE_VERTEX_TYPES use uint32_t keys, so this test covers the +// remaining hashable_vertex_id branch of the if constexpr dispatch. +// --------------------------------------------------------------------------- + +TEST_CASE("dijkstra(indexed_heap) - string vertex IDs (CLRS topology)", + "[algorithm][dijkstra][indexed_heap][sparse][string_id]") { + using VId = std::string; + using Traits = graph::container::mov_graph_traits; + using Graph = graph::container::dynamic_graph; + + static_assert(!adj_list::index_vertex_range, + "string-keyed graph must not satisfy index_vertex_range"); + static_assert(adj_list::hashable_vertex_id, + "std::string must satisfy hashable_vertex_id"); + + // CLRS Figure 24.6 with string keys. + Graph g({{"s", "t", 10}, {"s", "y", 5}, + {"t", "x", 1}, {"t", "y", 2}, + {"x", "z", 4}, + {"y", "t", 3}, {"y", "x", 9}, {"y", "z", 2}, + {"z", "s", 7}, {"z", "x", 6}}); + + auto d_def = make_vertex_property_map(g, infinite_distance()); + auto p_def = make_vertex_property_map(g, VId{}); + auto d_idx = make_vertex_property_map(g, infinite_distance()); + auto p_idx = make_vertex_property_map(g, VId{}); + for (auto&& [uid, u] : views::vertexlist(g)) { + p_def[uid] = uid; + p_idx[uid] = uid; + } + + auto wt = [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }; + + VId source{"s"}; + dijkstra_shortest_paths(g, source, + container_value_fn(d_def), container_value_fn(p_def), + wt, empty_visitor{}, std::less{}, std::plus{}, + use_default_heap{}, std::allocator{}); + + dijkstra_shortest_paths(g, source, + container_value_fn(d_idx), container_value_fn(p_idx), + wt, empty_visitor{}, std::less{}, std::plus{}, + use_indexed_dary_heap<>{}, std::allocator{}); + + // Textbook distances from CLRS Figure 24.6. + CHECK(d_idx["s"] == 0); + CHECK(d_idx["t"] == 8); + CHECK(d_idx["x"] == 9); + CHECK(d_idx["y"] == 5); + CHECK(d_idx["z"] == 7); + + // Indexed heap must agree with the default heap on every vertex. + for (auto&& [uid, u] : views::vertexlist(g)) { + CHECK(d_idx[uid] == d_def[uid]); + } + CHECK(p_idx["s"] == "s"); +} diff --git a/tests/algorithms/test_indexed_dary_heap.cpp b/tests/algorithms/test_indexed_dary_heap.cpp new file mode 100644 index 0000000..236148d --- /dev/null +++ b/tests/algorithms/test_indexed_dary_heap.cpp @@ -0,0 +1,324 @@ +/** + * @file test_indexed_dary_heap.cpp + * @brief Catch2 tests for graph::detail::indexed_dary_heap. + * + * Coverage: + * - Construction, empty, size + * - push / pop ordering (ascending and descending input) + * - decrease-key (single and repeated) + * - contains / clear + * - Both arity 2 and arity 4 + * - Custom comparator (max-heap via std::greater) + * - Both position-map adapters: vector_position_map, assoc_position_map + * - Random stress (1 000 keys + 500 decrease-key ops) + * - push_or_decrease convenience + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +using graph::detail::indexed_dary_heap; +using graph::detail::vector_position_map; +using graph::detail::assoc_position_map; + +namespace { + +// Helper: drain a heap into a vector of keys, preserving pop order. +template +std::vector drain(Heap& h) { + std::vector out; + while (!h.empty()) { + out.push_back(h.top()); + h.pop(); + } + return out; +} + +// Build a min-heap with vector_position_map over [0, dist.size()). +template > +auto make_vec_heap(std::vector& dist, + std::vector& pos, + Compare cmp = {}) { + pos.assign(dist.size(), vector_position_map::npos); + auto distfn = [&dist](unsigned k) -> const double& { return dist[k]; }; + return indexed_dary_heap( + distfn, cmp, vector_position_map{pos}); +} + +} // namespace + +// --------------------------------------------------------------------------- +// Basic construction / empty +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: empty after construction", "[heap][indexed_dary_heap]") { + std::vector dist; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + CHECK(h.empty()); + CHECK(h.size() == 0u); +} + +// --------------------------------------------------------------------------- +// push / pop ordering +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: pops in ascending distance order", "[heap][indexed_dary_heap]") { + std::vector dist = {5.0, 2.0, 7.0, 1.0, 4.0}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + for (unsigned k = 0; k < dist.size(); ++k) { + h.push(k); + } + REQUIRE(h.size() == 5u); + + // Distances: 0→5, 1→2, 2→7, 3→1, 4→4 ⇒ expected key order: 3,1,4,0,2 + CHECK(drain(h) == std::vector{3, 1, 4, 0, 2}); +} + +TEST_CASE("indexed_dary_heap: descending input still pops ascending", "[heap][indexed_dary_heap]") { + std::vector dist = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + for (int k = 9; k >= 0; --k) { + h.push(static_cast(k)); + } + CHECK(drain(h) == std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); +} + +TEST_CASE("indexed_dary_heap: single element", "[heap][indexed_dary_heap]") { + std::vector dist = {42.0}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + h.push(0); + REQUIRE(h.size() == 1u); + CHECK(h.top() == 0u); + h.pop(); + CHECK(h.empty()); +} + +// --------------------------------------------------------------------------- +// decrease-key +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: decrease-key reorders top", "[heap][indexed_dary_heap]") { + std::vector dist = {10, 20, 30, 40}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + for (unsigned k = 0; k < 4; ++k) h.push(k); + CHECK(h.top() == 0u); + + // Move key 3 to the front by lowering its distance. + dist[3] = 1.0; + h.decrease(3); + CHECK(h.top() == 3u); + + h.pop(); + CHECK(h.top() == 0u); +} + +TEST_CASE("indexed_dary_heap: repeated decrease-key on same key", "[heap][indexed_dary_heap]") { + std::vector dist = {100, 100, 100, 100, 100}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + for (unsigned k = 0; k < 5; ++k) h.push(k); + + for (double d : {50.0, 25.0, 10.0, 1.0}) { + dist[2] = d; + h.decrease(2); + CHECK(h.top() == 2u); + } +} + +// --------------------------------------------------------------------------- +// contains / clear +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: contains tracks membership", "[heap][indexed_dary_heap]") { + std::vector dist = {1, 2, 3}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + CHECK_FALSE(h.contains(0)); + CHECK_FALSE(h.contains(1)); + + h.push(0); + h.push(2); + CHECK(h.contains(0)); + CHECK_FALSE(h.contains(1)); + CHECK(h.contains(2)); + + h.pop(); // removes 0 + CHECK_FALSE(h.contains(0)); + CHECK(h.contains(2)); +} + +TEST_CASE("indexed_dary_heap: clear empties and resets positions", "[heap][indexed_dary_heap]") { + std::vector dist = {1, 2, 3, 4}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + for (unsigned k = 0; k < 4; ++k) h.push(k); + REQUIRE(h.size() == 4u); + + h.clear(); + CHECK(h.empty()); + for (unsigned k = 0; k < 4; ++k) { + CHECK_FALSE(h.contains(k)); + CHECK(pos[k] == vector_position_map::npos); + } +} + +// --------------------------------------------------------------------------- +// push_or_decrease +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: push_or_decrease inserts then decreases", + "[heap][indexed_dary_heap]") { + std::vector dist = {10, 20, 30}; + std::vector pos; + auto h = make_vec_heap(dist, pos); + + // First call inserts. + h.push_or_decrease(1); + CHECK(h.size() == 1u); + CHECK(h.top() == 1u); + + h.push_or_decrease(2); + CHECK(h.size() == 2u); + CHECK(h.top() == 1u); // 20 < 30 + + // Lower key 2 below key 1 — second call should decrease, not duplicate. + dist[2] = 5.0; + h.push_or_decrease(2); + CHECK(h.size() == 2u); + CHECK(h.top() == 2u); +} + +// --------------------------------------------------------------------------- +// Arity 2 +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: arity 2 produces sorted drain", "[heap][indexed_dary_heap]") { + std::vector dist = {5, 2, 7, 1, 4, 9, 3, 8, 6, 0}; + std::vector pos; + auto h = make_vec_heap<2>(dist, pos); + + for (unsigned k = 0; k < dist.size(); ++k) h.push(k); + auto out = drain(h); + + REQUIRE(out.size() == 10u); + for (std::size_t i = 1; i < out.size(); ++i) { + CHECK(dist[out[i - 1]] <= dist[out[i]]); + } +} + +// --------------------------------------------------------------------------- +// Custom comparator: max-heap via std::greater +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: std::greater yields max-heap", "[heap][indexed_dary_heap]") { + std::vector dist = {5, 2, 7, 1, 4}; + std::vector pos; + auto h = make_vec_heap<4, std::greater>(dist, pos); + + for (unsigned k = 0; k < dist.size(); ++k) h.push(k); + // Distances: 0→5, 1→2, 2→7, 3→1, 4→4 ⇒ max-heap order: 2,0,4,1,3 + CHECK(drain(h) == std::vector{2, 0, 4, 1, 3}); +} + +// --------------------------------------------------------------------------- +// assoc_position_map (string keys) +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: assoc_position_map supports string keys", + "[heap][indexed_dary_heap][assoc_map]") { + std::unordered_map dist = { + {"a", 5.0}, {"b", 2.0}, {"c", 7.0}, {"d", 1.0}}; + std::unordered_map pos; + auto distfn = [&dist](const std::string& k) -> const double& { return dist.at(k); }; + + using PMap = assoc_position_map; + indexed_dary_heap, PMap, 4> h( + distfn, std::less{}, PMap{pos}); + + for (const auto& k : {"a", "b", "c", "d"}) h.push(k); + CHECK(drain(h) == std::vector{"d", "b", "a", "c"}); +} + +TEST_CASE("indexed_dary_heap: assoc_position_map decrease-key", "[heap][indexed_dary_heap][assoc_map]") { + std::unordered_map dist = { + {"x", 100.0}, {"y", 50.0}, {"z", 25.0}}; + std::unordered_map pos; + auto distfn = [&dist](const std::string& k) -> const double& { return dist.at(k); }; + + using PMap = assoc_position_map; + indexed_dary_heap, PMap, 4> h( + distfn, std::less{}, PMap{pos}); + + h.push("x"); h.push("y"); h.push("z"); + REQUIRE(h.top() == "z"); + + dist["x"] = 1.0; + h.decrease("x"); + CHECK(h.top() == "x"); + CHECK(h.contains("x")); + CHECK(h.contains("y")); + CHECK(h.contains("z")); + + h.pop(); + CHECK_FALSE(h.contains("x")); +} + +// --------------------------------------------------------------------------- +// Random stress: cross-check monotone drain after mixed decrease-key +// --------------------------------------------------------------------------- + +TEST_CASE("indexed_dary_heap: random stress with decrease-key", + "[heap][indexed_dary_heap][stress]") { + constexpr unsigned N = 1000; + std::mt19937 rng(0xC0FFEE); + + std::vector dist(N); + std::uniform_real_distribution dgen(0.0, 1000.0); + for (auto& d : dist) d = dgen(rng); + + std::vector pos; + auto h = make_vec_heap<4>(dist, pos); + + for (unsigned k = 0; k < N; ++k) h.push(k); + + // 500 random decrease-key ops. + std::uniform_int_distribution kpick(0, N - 1); + for (int i = 0; i < 500; ++i) { + const unsigned k = kpick(rng); + dist[k] *= 0.5; + h.decrease(k); + } + + // Drain and assert monotone. + double prev = -1.0; + unsigned count = 0; + while (!h.empty()) { + const double cur = dist[h.top()]; + CHECK(cur >= prev); + prev = cur; + h.pop(); + ++count; + } + CHECK(count == N); +} diff --git a/tests/algorithms/test_mst.cpp b/tests/algorithms/test_mst.cpp index 8589c02..004ac19 100644 --- a/tests/algorithms/test_mst.cpp +++ b/tests/algorithms/test_mst.cpp @@ -633,3 +633,59 @@ TEMPLATE_TEST_CASE("prim - sparse invalid seed throws", container_value_fn(predecessor)), std::out_of_range); } + +// ============================================================================= +// Prim's Algorithm — indexed d-ary heap parity (Phase 5) +// ============================================================================= + +TEST_CASE("prim - indexed d-ary heap parity", "[algorithm][mst][prim][indexed_heap]") { + using Graph = vov_weighted; + using id_t = vertex_id_t; + + // 8-vertex weighted undirected graph that triggers post-finalization + // re-relaxation (the case that exposed the original Prim correctness bug). + // MST weight = 18 (verified by Kruskal cross-check below). + Graph g({{0, 1, 4}, {1, 0, 4}, {0, 2, 1}, {2, 0, 1}, {1, 2, 2}, {2, 1, 2}, + {1, 3, 5}, {3, 1, 5}, {2, 3, 8}, {3, 2, 8}, {2, 4, 10},{4, 2, 10}, + {3, 4, 2}, {4, 3, 2}, {3, 5, 6}, {5, 3, 6}, {4, 5, 3}, {5, 4, 3}, + {4, 6, 9}, {6, 4, 9}, {5, 6, 7}, {6, 5, 7}, {5, 7, 1}, {7, 5, 1}, + {6, 7, 4}, {7, 6, 4}}); + + const auto N = num_vertices(g); + + auto run = [&](auto heap_tag) { + std::vector predecessor(N); + std::vector weight(N); + init_shortest_paths(g, weight, predecessor); + auto total = prim(g, id_t{0}, + container_value_fn(weight), + container_value_fn(predecessor), + [](const auto& gr, const auto& uv) { return edge_value(gr, uv); }, + std::less(), + heap_tag); + return std::make_tuple(total, predecessor, weight); + }; + + auto [total_def, pred_def, wt_def ] = run(graph::use_default_heap{}); + auto [total_idx4, pred_idx4, wt_idx4] = run(graph::use_indexed_dary_heap<4>{}); + auto [total_idx8, pred_idx8, wt_idx8] = run(graph::use_indexed_dary_heap<8>{}); + + // Cross-check the absolute MST weight against Kruskal on the same edges. + using Edge = simple_edge; + std::vector edges = { + {0, 1, 4}, {0, 2, 1}, {1, 2, 2}, {1, 3, 5}, {2, 3, 8}, {2, 4, 10}, + {3, 4, 2}, {3, 5, 6}, {4, 5, 3}, {4, 6, 9}, {5, 6, 7}, {5, 7, 1}, + {6, 7, 4}}; + std::vector mst; + graph::kruskal(edges, mst); + const int kruskal_weight = total_weight(mst); + + REQUIRE(kruskal_weight == 18); // sanity + REQUIRE(total_def == 18); // Prim default heap matches Kruskal + REQUIRE(total_def == total_idx4); // and matches indexed Idx4 + REQUIRE(total_def == total_idx8); // and matches indexed Idx8 + + // Per-vertex tree-edge weights must agree (predecessor may differ when ties). + REQUIRE(wt_def == wt_idx4); + REQUIRE(wt_def == wt_idx8); +}