DataDog · dougqh · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
@@ -27,29 +27,35 @@
 import org.openjdk.jmh.infra.Blackhole;
 
 /**
- * Adversarial JMH benchmark designed to stress the metrics subsystem's capacity bounds.
+ * Adversarial JMH benchmark designed to stress every cardinality + capacity dimension of the
+ * metrics subsystem at once.
  *
- * <p>The metrics aggregator is bounded at every layer:
+ * <p>The metrics aggregator is supposed to be bounded by design:
  *
  * <ul>
- *   <li>The aggregate cache caps total entries at {@code tracerMetricsMaxAggregates} (default
- *       2048). Beyond that LRU eviction kicks in.
- *   <li>The producer/consumer inbox is a fixed-size MPSC queue ({@code tracerMetricsMaxPending});
- *       when full, producer {@code offer} returns false and the snapshot is dropped via {@link
- *       HealthMetrics#onStatsInboxFull()}.
- *   <li>Histograms use a bounded dense store -- per-histogram memory is fixed.
+ *   <li>{@link AggregateTable} caps total entries at {@code tracerMetricsMaxAggregates} (default
+ *       2048) and rejects further inserts when full.
+ *   <li>Each cardinality handler caps distinct values per reporting cycle; overflow collapses to
+ *       {@code blocked_by_tracer}.
+ *   <li>The producer/consumer inbox is a fixed-size MPSC queue ({@code tracerMetricsMaxPending},
+ *       default 2048); when full, producer {@code offer} returns false and the snapshot is dropped
+ *       via {@link HealthMetrics#onStatsInboxFull()}.
+ *   <li>Histograms use {@code CollapsingLowestDenseStore(1024)} -- bounded per-histogram memory.
+ *   <li>Cardinality handlers are flat open-addressed tables of fixed capacity -- no allocation on
+ *       the producer thread; allocation only on the consumer (handler reset clears, doesn't
+ *       reallocate).
  * </ul>
  *
- * <p>The benchmark hammers all of these simultaneously with 8 producer threads, unique labels per
- * op (so the aggregate cache fills+evicts repeatedly), random durations across a wide range (so
- * histograms accept many distinct bins), and random {@code error}/{@code topLevel} flags (so both
- * histograms are exercised). After the run, drop counters are printed so you can see how the
- * subsystem absorbed the burst.
+ * <p>This benchmark hammers all of those bounds simultaneously with 8 producer threads, unique
+ * labels per op (so handlers cap and the table fills+evicts repeatedly), random durations across a
+ * wide range (so histograms accept many distinct bins), and random {@code error}/{@code topLevel}
+ * flags (so both histograms are exercised). After the run, tearDown prints the drop counters so
+ * you can verify the subsystem stayed bounded under attack.
  *
- * <p>What "OOM the metrics subsystem" would look like if the bounds break: producer-thread
- * allocation would grow unbounded (snapshots faster than the inbox can drain produces dropped
- * snapshots, not heap growth); aggregator-thread heap would grow if entries weren't capped or
- * histograms grew past their dense-store limit.
+ * <p>What "OOM the metrics subsystem" would look like if the bounds broke: producer-thread
+ * allocation would grow unbounded (with bounds intact, a full inbox drops snapshots instead of
+ * retaining them); aggregator-thread heap would grow if entries weren't capped, if handlers grew
+ * past their cap, or if histograms grew past their dense-store limit.
  */
 @State(Scope.Benchmark)
 @Warmup(iterations = 2, time = 15, timeUnit = SECONDS)
@@ -100,17 +106,17 @@ public void tearDown() {
     System.err.println(
         "  onStatsAggregateDropped  = "
             + health.aggregateDropped.sum()
-            + "   (snapshots dropped because the aggregate cache was full with no stale entry)");
+            + "   (snapshots dropped because the AggregateTable was full with no stale entry)");
   }
 
   @Benchmark
   public void publish(ThreadState ts, Blackhole blackhole) {
     int idx = ts.cursor++;
     ThreadLocalRandom rng = ThreadLocalRandom.current();
 
-    // Mix indices so labels don't fall into linear order. Distinct labels exceed every reasonable
-    // working-set bound, so the aggregate cache evicts continuously and most ops force a fresh
-    // MetricKey construction on the consumer thread.
+    // Mix indices so labels don't fall into linear order in the handler tables. Distinct labels
+    // exceed every cap (RESOURCE=512, OPERATION=128, SERVICE=128, peer.hostname=512), so handlers
+    // saturate fast and most ops resolve to the blocked-by-tracer sentinel.
     int scrambled = idx * 0x9E3779B1; // golden ratio multiplier
     String service = "svc-" + (scrambled & 0xFFFF);
     String operation = "op-" + ((scrambled >>> 8) & 0x3FFFF);

@@ -0,0 +1,83 @@
+package datadog.trace.common.metrics;
+
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND;
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT;
+import static java.util.concurrent.TimeUnit.MICROSECONDS;
+import static java.util.concurrent.TimeUnit.SECONDS;
+
+import datadog.communication.ddagent.DDAgentFeaturesDiscovery;
+import datadog.trace.api.WellKnownTags;
+import datadog.trace.core.CoreSpan;
+import datadog.trace.core.monitor.HealthMetrics;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Miss-path variant of {@link ClientStatsAggregatorBenchmark}. Each op publishes a single-span
+ * trace from a pre-built pool where every span has a unique (service, operation, resource) tuple.
+ * After cardinality budgets fill, fields canonicalize to the {@code blocked_by_tracer} sentinel,
+ * but the producer still allocates a {@link SpanSnapshot} per op and enqueues it for the aggregator
+ * -- so the steady state exercises the per-op publish allocations + the consumer's
+ * canonicalize/match work, not the hit-path-only pattern of the other benchmarks.
+ *
+ * <p>Run with {@code -prof gc} to compare allocation rates against master's {@code
+ * ConflatingMetricsAggregator}.
+ */
+@State(Scope.Benchmark)
+@Warmup(iterations = 1, time = 15, timeUnit = SECONDS)
+@Measurement(iterations = 3, time = 15, timeUnit = SECONDS)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(MICROSECONDS)
+@Fork(value = 1)
+public class ClientStatsAggregatorMissPathBenchmark {
+
+  private static final int POOL_SIZE = 4096; // distinct single-span traces, replayed round-robin
+
+  private final DDAgentFeaturesDiscovery featuresDiscovery =
+      new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery(
+          Collections.singleton("peer.hostname"), Collections.emptySet());
+  private final ClientStatsAggregator aggregator =
+      new ClientStatsAggregator(
+          new WellKnownTags("", "", "", "", "", ""),
+          Collections.emptySet(),
+          featuresDiscovery,
+          HealthMetrics.NO_OP,
+          new ClientStatsAggregatorBenchmark.NullSink(),
+          2048,
+          2048,
+          false); // NOTE(review): start() is never called here -- confirm the consumer drains
+
+  private final List<List<CoreSpan<?>>> pool = generatePool(POOL_SIZE);
+  private int cursor; // next pool index; unsynchronized shared state -- racy if run with -t > 1
+
+  static List<List<CoreSpan<?>>> generatePool(int n) {
+    List<List<CoreSpan<?>>> out = new ArrayList<>(n);
+    for (int i = 0; i < n; i++) { // i makes each service/operation/resource/host label distinct
+      SimpleSpan span =
+          new SimpleSpan(
+              "svc-" + i, "op-" + i, "res-" + i, "type-" + (i & 7), true, true, false, 0, 10, -1);
+      span.setTag(SPAN_KIND, SPAN_KIND_CLIENT);
+      span.setTag("peer.hostname", "host-" + i); // key also passed to FixedAgentFeaturesDiscovery
+      out.add(Collections.singletonList(span));
+    }
+    return out;
+  }
+
+  @Benchmark
+  public void benchmark(Blackhole blackhole) {
+    int idx = cursor;
+    cursor = (idx + 1) % POOL_SIZE; // wrap so the pool is replayed in order indefinitely
+    blackhole.consume(aggregator.publish(pool.get(idx)));
+  }
+}
@@ -0,0 +1,176 @@
+package datadog.trace.common.metrics;
+
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND;
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT;
+import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_SERVER;
+import static java.util.concurrent.TimeUnit.SECONDS;
+
+import datadog.trace.api.WellKnownTags;
+import datadog.trace.bootstrap.instrumentation.api.AgentScope;
+import datadog.trace.bootstrap.instrumentation.api.AgentSpan;
+import datadog.trace.common.writer.Writer;
+import datadog.trace.core.CoreTracer;
+import datadog.trace.core.DDSpan;
+import datadog.trace.core.monitor.HealthMetrics;
+import java.lang.reflect.Field;
+import java.util.Collections;
+import java.util.List;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * End-to-end JMH benchmark of a 3-span HTTP-style trace through {@link CoreTracer}: one {@code
+ * span.kind=server} root + two {@code span.kind=client} children, as if a service handled an
+ * incoming request that made two outbound HTTP calls. Children inherit the server span as parent
+ * via implicit scope-based parentage; the root finishes last so {@code PendingTrace.write} ->
+ * {@code tracer.write(trace)} -> metricsAggregator.publish + writer.write (no-op) runs
+ * synchronously on the producing thread.
+ *
+ * <p>Runs multi-threaded ({@link Threads} = 8 by default; override with {@code -t N}) so the
+ * allocation rate {@code -prof gc} reports reflects multiple producers hitting the shared metrics
+ * aggregator + writer pipeline, and so we can compare total throughput between revisions.
+ *
+ * <p>Reflection is used to swap the tracer's default no-op {@code metricsAggregator} for a real
+ * {@link ClientStatsAggregator} so the metrics pipeline actually runs.
+ *
+ * <p>Two modes via {@code @Param}:
+ *
+ * <ul>
+ *   <li>{@code stable} -- every op uses the same labels (cache-hit path on the consumer).
+ *   <li>{@code varied} -- labels embed a per-thread op counter: unique within a thread, repeated
+ *       across threads (miss path until cardinality budgets fill, then sentinel collapse).
+ * </ul>
+ */
+@State(Scope.Benchmark)
+@Warmup(iterations = 2, time = 15, timeUnit = SECONDS)
+@Measurement(iterations = 5, time = 15, timeUnit = SECONDS)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(SECONDS)
+@Threads(8)
+@Fork(value = 2)
+public class TracePipelineBenchmark {
+
+  @Param({"stable", "varied"})
+  String mode;
+
+  private CoreTracer tracer;
+  private ClientStatsAggregator aggregator;
+  private boolean stable;
+
+  @State(Scope.Thread)
+  public static class ThreadState {
+    int cursor; // per-thread op counter; every thread starts at 0
+  }
+
+  @Setup
+  public void setup() throws Exception {
+    this.stable = "stable".equals(mode);
+    this.tracer = CoreTracer.builder().writer(new NoopWriter()).strictTraceWrites(false).build();
+    this.aggregator =
+        new ClientStatsAggregator(
+            new WellKnownTags("", "", "", "", "", ""),
+            Collections.emptySet(),
+            new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery(
+                Collections.singleton("peer.hostname"), Collections.emptySet()),
+            HealthMetrics.NO_OP,
+            new ClientStatsAggregatorBenchmark.NullSink(),
+            2048,
+            2048,
+            false);
+    this.aggregator.start();
+    // Replace the no-op aggregator the tracer was constructed with. The field is package-private
+    // in datadog.trace.core; reflect since this benchmark lives in the metrics package.
+    Field f = CoreTracer.class.getDeclaredField("metricsAggregator");
+    f.setAccessible(true);
+    f.set(this.tracer, this.aggregator);
+  }
+
+  @TearDown
+  public void tearDown() {
+    aggregator.close(); // NOTE(review): tracer.close() may close this aggregator again -- confirm
+    tracer.close();
+  }
+
+  @Benchmark
+  public void threeSpanTrace(ThreadState ts, Blackhole blackhole) {
+    int idx = ts.cursor++;
+    String service = stable ? "svc" : "svc-" + idx;
+    String serverOp = stable ? "servlet.request" : "servlet.request-" + idx;
+    String serverResource = stable ? "GET /widgets/{id}" : "GET /widgets/" + idx;
+    String clientOp = stable ? "http.request" : "http.request-" + idx;
+    String clientResource1 = stable ? "GET /downstream-a" : "GET /downstream-a/" + idx;
+    String clientResource2 = stable ? "GET /downstream-b" : "GET /downstream-b/" + idx;
+    String hostA = stable ? "host-a" : "host-a-" + idx;
+    String hostB = stable ? "host-b" : "host-b-" + idx;
+
+    AgentSpan server = tracer.startSpan("servlet", serverOp);
+    server.setResourceName(serverResource);
+    server.setServiceName(service);
+    server.setTag(SPAN_KIND, SPAN_KIND_SERVER);
+    AgentScope serverScope = tracer.activateSpan(server);
+    try {
+      AgentSpan client1 = tracer.startSpan("okhttp", clientOp);
+      client1.setResourceName(clientResource1);
+      client1.setServiceName(service);
+      client1.setTag(SPAN_KIND, SPAN_KIND_CLIENT);
+      client1.setTag("peer.hostname", hostA);
+      AgentScope client1Scope = tracer.activateSpan(client1);
+      try {
+        // simulated unit of in-call work would go here
+      } finally {
+        client1Scope.close();
+      }
+      client1.finish();
+
+      AgentSpan client2 = tracer.startSpan("okhttp", clientOp);
+      client2.setResourceName(clientResource2);
+      client2.setServiceName(service);
+      client2.setTag(SPAN_KIND, SPAN_KIND_CLIENT);
+      client2.setTag("peer.hostname", hostB);
+      AgentScope client2Scope = tracer.activateSpan(client2);
+      try {
+        // simulated unit of in-call work would go here
+      } finally {
+        client2Scope.close();
+      }
+      client2.finish();
+    } finally {
+      serverScope.close();
+    }
+    // Finishing the root last triggers PendingTrace.write -> tracer.write -> metrics + writer on
+    // this thread, since all child refs have already decremented to zero.
+    server.finish();
+    blackhole.consume(server);
+  }
+
+  private static final class NoopWriter implements Writer {
+    @Override
+    public void write(List<DDSpan> trace) {}
+
+    @Override
+    public void start() {}
+
+    @Override
+    public boolean flush() {
+      return true;
+    }
+
+    @Override
+    public void close() {}
+
+    @Override
+    public void incrementDropCounts(int spanCount) {}
+  }
+}