1 change: 1 addition & 0 deletions quickwit/quickwit-jaeger/src/lib.rs
@@ -2723,6 +2723,7 @@ mod tests {
scroll_id: None,
failed_splits: Vec::new(),
num_successful_splits: 1,
resource_stats: None,
})
});

275 changes: 275 additions & 0 deletions quickwit/quickwit-proto/protos/quickwit/search.proto
@@ -319,6 +319,11 @@ message SearchResponse {

// Total number of successful splits searched.
uint64 num_successful_splits = 8;

// Per-request execution telemetry. See `RootResourceStats` for the
// execution model and a diagnostic playbook. Optional for backwards
// compatibility — older servers / clients may omit this.
optional RootResourceStats resource_stats = 10;
}

message SearchPlanResponse {
@@ -355,12 +360,282 @@ message LeafSearchRequest {
repeated string index_uris = 9;
}

// Per-leaf execution stats for a single leaf search response.
//
// All fields are cheap to compute (counters and `Instant::now()` calls
// already on the hot path). Aggregated by the root into `RootResourceStats`
// — see that message for the request-scoped view and the diagnostic
// playbook.
message ResourceStats {
// Bytes pulled into the per-request short-lived cache during warmup.
// Proxy for "bytes downloaded from object storage" by this leaf.
//
// The proxy is imperfect because some of the data may already have been in cache.
uint64 short_lived_cache_num_bytes = 1;
// Total number of documents in the splits the leaf considered for search
// (after timestamp/tag pruning).
uint64 split_num_docs = 2;
// Wall time spent in the warmup phase (downloading metadata + hot bytes).
uint64 warmup_microsecs = 3;
// Time spent queued waiting for a permit on the search CPU thread pool.
// Reflects pool saturation, not the cost of the query itself.
uint64 cpu_thread_pool_wait_microsecs = 4;
// CPU time spent on the search CPU thread pool — matching, scoring,
// top-k collection, and aggregation collection combined.
uint64 cpu_microsecs = 5;
// CPU time spent specifically inside the aggregation collector,
// measured around `AggregationSegmentCollector::collect_block` calls.
// Zero when the request has no aggregation.
uint64 aggregation_cpu_microsecs = 6;
// Wall clock time for the entire leaf request, measured from the
// entry of `multi_index_leaf_search` to the moment the merged
// response is ready to return. Includes I/O permit waits, warmup
// (parallel across splits), CPU permit queueing, search execution,
// and the per-leaf merge.
//
// This is set ONCE per leaf request — the per-split path does not
// populate it, so `merge_resource_stats` summing across per-split
// responses leaves it at zero, and the leaf-level handler then
// overwrites it with the actual wall time (see the sketch after this
// message).
//
// The root takes the max and sum of this field across leaves to
// populate `RootResourceStats.max_leaf_wall_microsecs` and
// `sum_leaf_wall_microsecs` — i.e. the slowest leaf and the basis
// for an imbalance ratio.
uint64 wall_microsecs = 7;
}
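
// Illustrative only: a minimal Rust sketch of how a leaf could fold
// per-split stats into a single message, consistent with the field
// semantics above. The actual `merge_resource_stats` in Quickwit may
// differ; the names below simply mirror this proto.
//
//   fn merge_resource_stats(splits: &[ResourceStats]) -> ResourceStats {
//       let mut merged = ResourceStats::default();
//       for s in splits {
//           merged.short_lived_cache_num_bytes += s.short_lived_cache_num_bytes;
//           merged.split_num_docs += s.split_num_docs;
//           merged.warmup_microsecs += s.warmup_microsecs;
//           merged.cpu_thread_pool_wait_microsecs += s.cpu_thread_pool_wait_microsecs;
//           merged.cpu_microsecs += s.cpu_microsecs;
//           merged.aggregation_cpu_microsecs += s.aggregation_cpu_microsecs;
//           // `wall_microsecs` is deliberately left at zero: the
//           // leaf-level handler overwrites it with the measured wall time.
//       }
//       merged
//   }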

// =============================================================================
// RootResourceStats — execution telemetry for a single search request.
// =============================================================================
//
// Aggregate of cheap-to-compute counters and timers, summed/maxed across all
// leaf responses participating in a search, plus a few root-side measurements
// (merge time, leaf count). The intent is to give an operator (or an AI agent)
// enough signal to attribute slowness to a specific stage of the search
// pipeline and to estimate whether the implementation has headroom.
//
// This is the request-scoped sibling of the per-leaf `ResourceStats`. Where
// `ResourceStats` describes one leaf's view of one or more splits,
// `RootResourceStats` describes the whole request as observed from the root.
//
// -----------------------------------------------------------------------------
// Quickwit search execution model (one request, end-to-end)
// -----------------------------------------------------------------------------
//
// A search request flows through the following stages. Each stage has at
// least one field below that exposes its cost.
//
// 1. Root planning. The root server resolves the index metadata, picks the
// splits to search (timestamp/tag pruning), groups them by leaf, and
// issues a `LeafSearchRequest` to every leaf. The cost of this phase is
// *not* covered by RootResourceStats today (typically negligible).
//
// 2. Leaf-side, per-split execution. Each leaf processes its assigned
// splits concurrently. Per split the lifecycle is:
// a. Acquire an I/O permit (admission control on concurrent
// downloads).
// b. WARMUP — download all required data from object storage (split
// metadata, posting list headers, term dictionary blocks needed
// by the query, fast-field headers). These bytes land in a
// per-request "short-lived cache" so subsequent reads are local.
// c. Acquire a CPU permit on the search CPU thread pool. Until a
// permit is granted the task is *queued* — this queueing time is
// recorded as `cpu_thread_pool_wait_microsecs`.
// d. Execute the query on the search CPU thread pool: matching/scoring,
// top-k collection, and (if requested) aggregation. CPU time
// spent here is `cpu_microsecs`.
// Time spent specifically inside the aggregation collector is
// `aggregation_cpu_microsecs`.
// The thread pool is sized to match the number of available
// hyperthreads, so this measurement is very close to actual CPU usage.
// e. Return a partial result.
//
// 3. Leaf-side merge. Each leaf merges its per-split partial results
// into a single `LeafSearchResponse` (top-k merge, aggregation tree
// merge).
//
// 4. Root merge. The root collects every `LeafSearchResponse` and merges
// them into the final `SearchResponse`. The wall time of this stage
// is `root_merge_microsecs`.
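//
// Illustrative only: where the stage-2 timers above might be sampled,
// as a hedged Rust sketch. This is not the actual Quickwit code; the
// permit and helper names (`io_permits`, `cpu_permits`, `warm_up`,
// `execute_query_on_pool`) are made up for the example.
//
//   let _io_permit = io_permits.acquire().await;          // 2a: admission control
//   let warmup_start = Instant::now();
//   warm_up(&split).await;                                // 2b: download hot bytes
//   stats.warmup_microsecs = warmup_start.elapsed().as_micros() as u64;
//   let queue_start = Instant::now();
//   let _cpu_permit = cpu_permits.acquire().await;        // 2c: queue for a CPU permit
//   stats.cpu_thread_pool_wait_microsecs = queue_start.elapsed().as_micros() as u64;
//   let exec_start = Instant::now();
//   let partial = execute_query_on_pool(&split);          // 2d: match/score/collect
//   // ~= CPU time, since the pool is sized to the available hyperthreads
//   stats.cpu_microsecs = exec_start.elapsed().as_micros() as u64;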
//
// -----------------------------------------------------------------------------
// Interpreting these fields — diagnostic playbook
// -----------------------------------------------------------------------------
//
// Let:
// wall := elapsed_time_micros (top-level field on SearchResponse)
// cpu := total_cpu_microsecs
// wait := total_cpu_thread_pool_wait_microsecs
// warmup := total_warmup_microsecs
// agg_cpu := total_aggregation_cpu_microsecs
// max_leaf := max_leaf_wall_microsecs
// avg_leaf := sum_leaf_wall_microsecs / num_leaf_responses
// downloaded := total_short_lived_cache_num_bytes
// docs_in_split := total_split_num_docs
// docs_matched := total_num_docs_matched
//
// Suggested rules of thumb (orient triage, not strict thresholds):
//
// * CPU-bound query: cpu >~ 0.5 * (wall * num_search_cpu_threads_per_leaf * num_leaf_responses)
// and warmup small relative to wall.
// => look at the query plan; check for
// unselective predicates, range queries on
// tokenized fields, phrase queries with
// frequent terms.
//
// * I/O-bound query: warmup >~ 0.5 * wall, or `downloaded` is large
// (hundreds of MB / GBs).
// => cold cache, large fast fields, many splits,
// or first-time access patterns. Check whether
// a different fast-field layout or a smaller
// time range would help.
//
// * Cluster overloaded: wait >~ cpu (waiting longer than computing).
// => the search CPU pool is saturated by other
// requests; the query itself may be fine.
// Re-run in isolation to confirm.
//
// * Aggregation-heavy: agg_cpu >~ 0.5 * cpu.
// => aggregation is the bottleneck, not matching.
// Look at bucket cardinality, sub-aggregations,
// or unbounded `terms` aggregations.
//
// * Imbalance across leaves: (max_leaf - avg_leaf) / avg_leaf >~ 1.
// => request fan-out is uneven. Common causes:
// one leaf has many more splits, one leaf has
// a colder cache, or other workload is
// starving one leaf's CPU pool.
//
// * Too much work: docs_matched >~ docs_in_split * 0.5.
// => query is barely selective; consider tighter
// filters, time pruning, or sample-based
// approaches.
//
// * Headroom estimate: wall * num_search_cpu_threads_per_leaf * num_leaf_responses - cpu - wait
// ~ idle CPU capacity unused by this request.
// If positive and large, parallelism is the
// limiting factor (more splits, more leaves).
// If near zero, we are at compute saturation.
//
// All values are sums/maxes over leaf responses unless noted. They are
// designed to be cheap (counters and `Instant::now()` calls already in the
// hot path) so they can be returned with every search response.
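//
// Illustrative only: a hedged Rust sketch applying the rules of thumb
// above. Field names mirror this proto; `threads_per_leaf` (the search
// CPU pool size) is an assumed deployment parameter, not part of the
// response, and the 0.2 / 0.5 / 2x thresholds are the soft cut-offs
// listed above, not hard limits.
//
//   fn triage(wall: u64, threads_per_leaf: u64, s: &RootResourceStats) -> Vec<&'static str> {
//       let mut findings = Vec::new();
//       // Total CPU budget available to this request across all leaves.
//       let budget = wall * threads_per_leaf * s.num_leaf_responses;
//       if s.total_cpu_microsecs as f64 >= 0.5 * budget as f64
//           && (s.total_warmup_microsecs as f64) < 0.2 * wall as f64 {
//           findings.push("cpu-bound: inspect the query plan");
//       }
//       if s.total_warmup_microsecs as f64 >= 0.5 * wall as f64 {
//           findings.push("i/o-bound: cold cache, large fast fields, or many splits");
//       }
//       if s.total_cpu_microsecs > 0
//           && s.total_cpu_thread_pool_wait_microsecs >= s.total_cpu_microsecs {
//           findings.push("cluster overloaded: re-run in isolation to confirm");
//       }
//       if s.total_cpu_microsecs > 0
//           && s.total_aggregation_cpu_microsecs * 2 >= s.total_cpu_microsecs {
//           findings.push("aggregation-heavy: check bucket cardinality");
//       }
//       if s.num_leaf_responses > 0 {
//           let avg = s.sum_leaf_wall_microsecs / s.num_leaf_responses;
//           if avg > 0 && s.max_leaf_wall_microsecs >= 2 * avg {
//               findings.push("leaf imbalance: uneven fan-out or one cold leaf");
//           }
//       }
//       if s.total_split_num_docs > 0
//           && s.total_num_docs_matched * 2 >= s.total_split_num_docs {
//           findings.push("barely selective: tighten filters or time range");
//       }
//       let headroom = budget
//           .saturating_sub(s.total_cpu_microsecs + s.total_cpu_thread_pool_wait_microsecs);
//       if headroom > budget / 2 {
//           findings.push("large idle headroom: parallelism is the limiting factor");
//       }
//       findings
//   }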
// -----------------------------------------------------------------------------
message RootResourceStats {
// -------------------------------------------------------------------------
// Sums across leaf responses
// -------------------------------------------------------------------------

// Sum of `cpu_microsecs` reported by every leaf — total CPU time
// spent on the search CPU thread pool, summed across all leaves.
//
// Compare against `wall * num_search_cpu_threads_per_leaf *
// num_leaf_responses` to estimate CPU utilization. A value close to
// that upper bound means the request was compute-saturated.
//
// Includes time spent matching, scoring, top-k collection, and
// aggregation collection. Does NOT include warmup or queueing.
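//
// Worked example (illustrative numbers): with wall = 2_000_000 µs,
// 8 search CPU threads per leaf, and 4 leaf responses, the budget is
// 2_000_000 * 8 * 4 = 64_000_000 µs of CPU. A total_cpu_microsecs near
// 64M means the request ran compute-saturated; near 6M, CPU was not
// the bottleneck.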
uint64 total_cpu_microsecs = 1;

// Sum of `warmup_microsecs` across leaves.
//
// Warmup is the phase where the leaf downloads, from object storage,
// the bytes it needs to evaluate the query (split metadata, posting
// list / dictionary blocks, fast-field headers). It runs before the
// query is dispatched to the search CPU pool.
//
// High values relative to `total_cpu_microsecs` signal an I/O-bound
// request — cold short-lived cache, large fast fields, high split
// count, or cross-region object storage latency.
uint64 total_warmup_microsecs = 2;

// Sum of `cpu_thread_pool_wait_microsecs` across leaves.
//
// Time leaves spent *queued*, waiting for a permit on the search CPU
// thread pool after warmup completed. This is admission-control
// queueing, not work.
//
// If this is comparable to or larger than `total_cpu_microsecs`, the
// cluster is CPU-saturated by other workloads — the query itself may
// be fine. Re-run in isolation to confirm.
uint64 total_cpu_thread_pool_wait_microsecs = 3;

// CPU time spent specifically inside the aggregation collector,
// summed across leaves. Zero when the request has no aggregation.
//
// Measured by sampling `Instant::now()` around each call to
// `AggregationSegmentCollector::collect_block` (block size <= 2048
// docs). Overhead is well below noise (~0.15 ns per matched doc).
//
// Compare to `total_cpu_microsecs` to get the fraction of CPU spent
// in aggregation. A high fraction (>~ 50%) points at heavy
// bucketization (terms with high cardinality, deep sub-aggregations)
// rather than at predicate matching.
uint64 total_aggregation_cpu_microsecs = 4;

// Sum of `short_lived_cache_num_bytes` across leaves — the size of
// the per-request hot-byte cache populated during warmup. A reasonable
// proxy for "GB downloaded from object storage for this query".
//
// Useful to flag I/O-heavy queries even when `total_warmup_microsecs`
// is hidden by parallelism (many splits, each downloading concurrently).
uint64 total_short_lived_cache_num_bytes = 5;

// Sum of `split_num_docs` across leaves — the total number of
// documents in all splits searched, after timestamp/tag pruning at
// the root and any further pruning at the leaf.
//
// This is the size of the haystack the query had to traverse. Pair
// with `total_num_docs_matched` to gauge selectivity.
uint64 total_split_num_docs = 9;

// Sum across leaves of the number of docs that matched the query
// (i.e. were passed to the collector via `collect_block` /
// `collect`). For non-aggregation queries this equals the global
// `num_hits` on `SearchResponse`; for aggregation queries it is the
// raw match count, which `num_hits` may not surface.
//
// Selectivity = `total_num_docs_matched / total_split_num_docs`.
// A value near 1 means the query is barely filtering and the cost
// is dominated by traversal of essentially every doc.
uint64 total_num_docs_matched = 10;

// -------------------------------------------------------------------------
// Aggregates over leaves (for imbalance detection)
// -------------------------------------------------------------------------

// Number of leaf responses folded into this stats object. Useful as
// the denominator when computing per-leaf averages from the sums above.
uint64 num_leaf_responses = 6;

// Wall time of the slowest leaf. This is a tight lower bound on the
// request's wall time (modulo root merge) because the root must wait
// for every leaf.
//
// If `max_leaf_wall_microsecs ~= elapsed_time_micros`, optimization
// effort should focus on the slowest leaf, not on parallelism.
uint64 max_leaf_wall_microsecs = 7;

// Sum of leaf wall times across all responding leaves. Combined with
// `num_leaf_responses` and `max_leaf_wall_microsecs`, lets the caller
// derive the imbalance ratio:
// avg = sum_leaf_wall_microsecs / num_leaf_responses
// imbalance = (max_leaf_wall_microsecs - avg) / avg
// A high imbalance indicates skew — uneven split assignment, cold
// cache on one leaf, or competing workload pinning a single leaf's
// CPU pool.
uint64 sum_leaf_wall_microsecs = 8;

// -------------------------------------------------------------------------
// Root-side instrumentation
// -------------------------------------------------------------------------

// Wall time spent in the root's merge step — combining leaf top-k
// results and intermediate aggregation results into the final
// `SearchResponse`. Typically small for plain searches; non-trivial
// for heavy aggregations (large bucket trees, deep sub-aggregations).
uint64 root_merge_microsecs = 11;
}

// LeafRequestRef references data in LeafSearchRequest to deduplicate data.