From 323b8c19d8c1107cee9ce889dc65a8854908ffc1 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 10 Jun 2026 18:35:01 -0600
Subject: [PATCH] =?UTF-8?q?feat(fuzz):=20coverage=20carries=20distribution?=
 =?UTF-8?q?s=20=E2=80=94=20score/dimension/latency=20spread=20+=20per-cell?=
 =?UTF-8?q?=20cost=20(0.91.0)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bare mean hides outliers. CoverageCell.robustness is renamed to score; score
and every dimension are now full Distributions (mean/median/p90/min/max/n), a
breaking change for consumers that read them as bare numbers. Evaluation latency
is engine-measured per run (consumer latencyMs overrides) and aggregates per
cell and capsule-wide;
per-cell cost splits known dollars from tracked-but-unknown runs (absent when
tracking is unwired — never a fabricated $0). Capsule stats replace the bare
meanRobustness with a robustness Distribution over per-cell means (cells weigh
equally — variance steering would bias a run-weighted average low) plus a
latency Distribution over all runs. HTML tiles color by mean and carry
median/min/latency/cost in the tooltip; KPIs add cell spread + median latency.
---
 clients/python/pyproject.toml                 |  2 +-
 clients/python/src/agent_eval_rpc/__init__.py |  2 +-
 examples/fuzz-legal-demo.mjs                  | 11 +--
 package.json                                  |  2 +-
 src/fuzz/capsule.ts                           | 28 ++++---
 src/fuzz/cube.ts                              | 73 +++++++++++++++----
 src/fuzz/explorer.ts                          | 24 ++++--
 src/fuzz/fuzz-agent.test.ts                   | 72 ++++++++++++++++--
 src/fuzz/types.ts                             | 37 ++++++++--
 9 files changed, 200 insertions(+), 51 deletions(-)

diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 5477b33..e411b1f 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.90.1"
+version = "0.91.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
index b325ae7..8d49638 100644
--- a/clients/python/src/agent_eval_rpc/__init__.py
+++ b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.90.1"
+    __version__ = "0.91.0"
 
 __all__ = [
     "Client",
diff --git a/examples/fuzz-legal-demo.mjs b/examples/fuzz-legal-demo.mjs
index 76f5387..d842ef4 100644
--- a/examples/fuzz-legal-demo.mjs
+++ b/examples/fuzz-legal-demo.mjs
@@ -100,17 +100,18 @@ console.log(JSON.stringify({
   totalRuns: s.totalRuns,
   cells: `${s.cellsCovered}/${s.cellsTotal}`,
   behaviorBinsObserved: s.behaviorBinsObserved,
-  meanRobustness: +s.meanRobustness.toFixed(3),
+  robustness: s.robustness && { mean: +s.robustness.mean.toFixed(3), median: +s.robustness.median.toFixed(3), min: +s.robustness.min.toFixed(3), max: +s.robustness.max.toFixed(3) },
+  medianLatencyMs: s.latencyMs && Math.round(s.latencyMs.median),
   candidateFindings: s.candidateFindings,
   verifiedFindings: s.verifiedFindings,
   weakestCells: capsule.coverage
-    .filter((c) => c.robustness != null)
-    .sort((a, b) => a.robustness - b.robustness)
+    .filter((c) => c.score != null)
+    .sort((a, b) => a.score.mean - b.score.mean)
     .slice(0, 4)
     .map((c) => ({
       cell: Object.values(c.cell.coords).join('·'),
-      robustness: +c.robustness.toFixed(2),
-      weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1] - b[1])[0]?.[0],
+      score: { mean: +c.score.mean.toFixed(2), min: +c.score.min.toFixed(2) },
+      weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1].mean - b[1].mean)[0]?.[0],
     })),
   topFinding: capsule.findings[0]
     ? {
diff --git a/package.json b/package.json
index 7ec54ac..b336786 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.90.1",
+  "version": "0.91.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
diff --git a/src/fuzz/capsule.ts b/src/fuzz/capsule.ts
index ef7287c..56bbbf7 100644
--- a/src/fuzz/capsule.ts
+++ b/src/fuzz/capsule.ts
@@ -9,8 +9,8 @@
  */
 
 import type { EvalRecord } from './cube'
-import { buildCoverage } from './cube'
-import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Finding } from './types'
+import { buildCoverage, distribution } from './cube'
+import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Distribution, Finding } from './types'
 
 export interface BuildCapsuleInput<S> {
   target: string
@@ -35,8 +35,11 @@ export interface BuildCapsuleInput<S> {
 export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
   const coverage = buildCoverage(input.cells, input.log, input.threshold)
   const covered = coverage.filter((c) => c.runs > 0)
-  const meanRobustness =
-    covered.length === 0 ? 0 : covered.reduce((a, c) => a + (c.robustness ?? 0), 0) / covered.length
+  // Cells weigh equally: variance steering sends more runs to weak cells, so a
+  // run-weighted average would bias the headline low.
+  const robustness =
+    covered.length === 0 ? null : distribution(covered.map((c) => (c.score as Distribution).mean))
+  const latencyMs = input.log.length === 0 ? null : distribution(input.log.map((r) => r.latencyMs))
   // Measured-descriptor bins beyond the bare input cell — observed, never planned.
   const behaviorBinsObserved = input.archive.filter((e) => e.binId !== e.cell.id).length
 
@@ -53,7 +56,8 @@ export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
       behaviorBinsObserved,
       candidateFindings: input.candidateFindings,
       verifiedFindings: input.findings.length,
-      meanRobustness,
+      robustness,
+      latencyMs,
       ...(input.cost
         ? { costUsd: input.cost.costUsd, costUnknownRuns: input.cost.costUnknownRuns }
         : {}),
@@ -99,7 +103,7 @@ function deriveAxes(coverage: CoverageCell[]): Array<{ name: string; values: str
 
 /** The weakest dimension chip for a cell, e.g. `safety 32%` — shown when scores exist. */
 function weakestDim(c: CoverageCell): string {
-  const entries = Object.entries(c.dimensions)
+  const entries = Object.entries(c.dimensions).map(([k, d]) => [k, d.mean] as [string, number])
   if (entries.length === 0) return ''
   const sorted = entries.sort((a, b) => a[1] - b[1])
   const w = sorted[0]
@@ -111,9 +115,9 @@ function heatmapHtml(coverage: CoverageCell[]): string {
   const axes = deriveAxes(coverage)
   const byId = new Map(coverage.map((c) => [c.cell.id, c]))
   const tile = (c: CoverageCell | undefined, label: string): string => {
-    const r = c?.robustness ?? null
-    const title = c
-      ? `${pct(r ?? 0)} robust · ${c.runs} runs · ${pct(c.findingRate)} flagged`
+    const r = c?.score?.mean ?? null
+    const title = c?.score
+      ? `${pct(c.score.mean)} robust (median ${pct(c.score.median)}, min ${pct(c.score.min)}) · ${c.runs} runs · ${pct(c.findingRate)} flagged${c.latencyMs ? ` · ${(c.latencyMs.median / 1000).toFixed(1)}s median` : ''}${c.costUsd !== undefined ? ` · $${c.costUsd.toFixed(2)}` : ''}`
       : 'not covered'
     return `<div class="tile" style="background:${robustnessColor(r)}" title="${esc(title)}">${label ? `<span class="tl">${esc(label)}</span>` : ''}<span class="tv">${c && r != null ? pct(r) : '—'}</span>${c ? weakestDim(c) : ''}</div>`
   }
@@ -136,7 +140,7 @@ function heatmapHtml(coverage: CoverageCell[]): string {
     return `<div class="axis-label">rows: <b>${esc(rowAxis.name)}</b> · cols: <b>${esc(colAxis.name)}</b></div><table class="heat">${head}${rows}</table>`
   }
 
-  const sorted = [...coverage].sort((a, b) => (a.robustness ?? 2) - (b.robustness ?? 2))
+  const sorted = [...coverage].sort((a, b) => (a.score?.mean ?? 2) - (b.score?.mean ?? 2))
   return `<div class="grid">${sorted.map((c) => tile(c, Object.values(c.cell.coords).join(' · '))).join('')}</div>`
 }
 
@@ -236,7 +240,9 @@ table.heat th.rh{text-align:right}
 <h1>${esc(capsule.objective)} exploration · ${esc(capsule.target)}</h1>
 <div class="sub">${s.totalRuns} scenarios across ${s.cellsCovered}/${s.cellsTotal} planned cells${s.behaviorBinsObserved > 0 ? ` · ${s.behaviorBinsObserved} measured behavior bins` : ''}${stamp ? ` · ${esc(stamp)}` : ''}</div>
 <div class="kpis">
-${kpi('mean robustness', pct(s.meanRobustness), s.meanRobustness < 0.6 ? '#e58a96' : '#5ad17a')}
+${s.robustness ? kpi('robustness', `${pct(s.robustness.mean)}`, s.robustness.mean < 0.6 ? '#e58a96' : '#5ad17a') : ''}
+${s.robustness ? kpi('cell spread', `${pct(s.robustness.min)}–${pct(s.robustness.max)}`) : ''}
+${s.latencyMs ? kpi('median latency', `${(s.latencyMs.median / 1000).toFixed(1)}s`, s.latencyMs.p90 > 4 * s.latencyMs.median ? '#e5b566' : '#e6e6e6') : ''}
 ${kpi('verified findings', String(s.verifiedFindings), s.verifiedFindings > 0 ? '#e58a96' : '#5ad17a')}
 ${kpi('cells covered', `${s.cellsCovered}/${s.cellsTotal}`)}
 ${kpi('scenarios run', String(s.totalRuns))}
diff --git a/src/fuzz/cube.ts b/src/fuzz/cube.ts
index 8d980cc..6f24c97 100644
--- a/src/fuzz/cube.ts
+++ b/src/fuzz/cube.ts
@@ -3,12 +3,13 @@
  *
  * Cells are the cartesian product of the input axes — the stratification plan,
  * enumerable up front so the planned-vs-covered denominator is honest. Coverage
- * is projected from the evaluation log: per cell, mean headline robustness, the
- * mean of each scored dimension (so the map shows WHICH dimension is weak), and
- * the rate at which the active objective flagged a candidate.
+ * is projected from the evaluation log: per cell, the full DISTRIBUTION of the
+ * headline score, of each scored dimension, and of evaluation latency — a bare
+ * mean hides outliers, so every aggregate carries its spread. Per-cell cost is
+ * split known-dollars vs unknown-runs, never folded into a fabricated $0.
  */
 
-import type { BehaviorSpace, Cell, CoverageCell, Evaluation } from './types'
+import type { BehaviorSpace, Cell, CoverageCell, Distribution, Evaluation } from './types'
 
 /** One recorded evaluation — the unit coverage and the capsule are built from. */
 export interface EvalRecord {
@@ -16,6 +17,11 @@ export interface EvalRecord {
   ev: Evaluation
   /** The objective's interest score for this evaluation. */
   interest: number
+  /** Evaluation wall-clock — engine-measured unless `ev.latencyMs` overrode it. */
+  latencyMs: number
+  /** Known dollars for this run. `null` = cost tracking was wired but this
+   *  run's cost was unknowable (counted apart). Absent = not tracked at all. */
+  costUsd?: number | null
 }
 
 /** Enumerate every input cell (cartesian product of the axes), in stable order. */
@@ -37,12 +43,32 @@ export function cellId(space: BehaviorSpace, coords: Record<string, string>): st
   return space.axes.map((a) => `${a.name}=${coords[a.name]}`).join('|')
 }
 
-const mean = (xs: number[]): number =>
-  xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length
+/** Nearest-rank percentile (`p` in [0,1]) of a non-empty, ascending-sorted sample. */
+function percentile(sorted: number[], p: number): number {
+  const idx = Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))
+  return sorted[idx] as number
+}
+
+/** Summarize a sample. Throws on an empty sample — callers represent "no data"
+ *  as `null`, never as a zeroed distribution. */
+export function distribution(values: number[]): Distribution {
+  if (values.length === 0)
+    throw new Error('distribution: empty sample — represent missing data as null, not zeros')
+  const sorted = [...values].sort((a, b) => a - b)
+  const mean = sorted.reduce((a, b) => a + b, 0) / sorted.length
+  return {
+    mean,
+    median: percentile(sorted, 0.5),
+    p90: percentile(sorted, 0.9),
+    min: sorted[0] as number,
+    max: sorted[sorted.length - 1] as number,
+    n: sorted.length,
+  }
+}
 
 /**
  * Project the evaluation log into the per-input-cell coverage map. A cell with
- * no evaluations reports `robustness: null` (honestly uncovered), never 0.
+ * no evaluations reports `score: null` (honestly uncovered), never zeros.
  */
 export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: number): CoverageCell[] {
   const byCell = new Map<string, EvalRecord[]>()
@@ -54,17 +80,36 @@ export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: numbe
   return cells.map((cell) => {
     const recs = byCell.get(cell.id) ?? []
     const runs = recs.length
-    if (runs === 0) return { cell, runs: 0, robustness: null, findingRate: 0, dimensions: {} }
-    const robustness = mean(recs.map((r) => r.ev.score))
+    if (runs === 0)
+      return { cell, runs: 0, score: null, findingRate: 0, dimensions: {}, latencyMs: null }
+
+    const score = distribution(recs.map((r) => r.ev.score))
+    const latencyMs = distribution(recs.map((r) => r.latencyMs))
     const findingRate = recs.filter((r) => r.interest >= threshold).length / runs
-    const dims: Record<string, number[]> = {}
+
+    const dimSamples: Record<string, number[]> = {}
     for (const r of recs) {
       for (const [k, v] of Object.entries(r.ev.scores ?? {})) {
-        ;(dims[k] ??= []).push(v)
+        ;(dimSamples[k] ??= []).push(v)
       }
     }
-    const dimensions: Record<string, number> = {}
-    for (const [k, xs] of Object.entries(dims)) dimensions[k] = mean(xs)
-    return { cell, runs, robustness, findingRate, dimensions }
+    const dimensions: Record<string, Distribution> = {}
+    for (const [k, xs] of Object.entries(dimSamples)) dimensions[k] = distribution(xs)
+
+    // Cost fields appear only when tracking was wired: known dollars sum, and
+    // tracked-but-unknown runs counted apart — never folded in as $0.
+    const tracked = recs.filter((r) => r.costUsd !== undefined)
+    const known = tracked.filter((r) => r.costUsd !== null)
+    const cost =
+      tracked.length > 0
+        ? {
+            costUsd: known.reduce((a, r) => a + (r.costUsd as number), 0),
+            ...(tracked.length > known.length
+              ? { costUnknownRuns: tracked.length - known.length }
+              : {}),
+          }
+        : {}
+
+    return { cell, runs, score, findingRate, dimensions, latencyMs, ...cost }
   })
 }
diff --git a/src/fuzz/explorer.ts b/src/fuzz/explorer.ts
index c5b6052..5297b61 100644
--- a/src/fuzz/explorer.ts
+++ b/src/fuzz/explorer.ts
@@ -156,12 +156,14 @@ export class BehaviorExplorer<S> {
 
   /** Fold one run's cost in: null counts as unknown (never $0); a known cost
-   *  accrues toward the budget, lands in the ledger, and fires `onCost`. */
-  private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): void {
-    if (!this.opts.costOf) return
+   *  accrues toward the budget, lands in the ledger, and fires `onCost`.
+   *  Returns the run's known cost, `null` when tracked-but-unknown, `undefined`
+   *  when cost tracking is not wired — the log row mirrors this exactly. */
+  private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): number | null | undefined {
+    if (!this.opts.costOf) return undefined
     const cost = this.opts.costOf(scenario, cell, ev)
     if (cost === null) {
       this.costUnknownRuns++
-      return
+      return null
     }
     if (typeof cost.usd !== 'number' || !Number.isFinite(cost.usd) || cost.usd < 0) {
       throw new RangeError(
@@ -178,6 +180,7 @@ export class BehaviorExplorer<S> {
       tags: { target: this.opts.target, cell: cell.id },
     })
     this.opts.onCost?.({ usd: cost.usd, channel: 'agent' })
+    return cost.usd
   }
 
   /** Elites whose INPUT cell matches — what the proposer mutates/deepens from. */
@@ -242,13 +245,24 @@ export class BehaviorExplorer<S> {
           // Consecutive failures trip the circuit breaker instead, so a dead
           // backend stops the run rather than burning the remaining budget.
           try {
+            const startedAt = performance.now()
             const ev = await this.opts.evaluate(scenario, cell)
+            // Consumer-measured latency wins (it can exclude judge time); the
+            // engine's wall-clock is the default so latency is never missing.
+            const latencyMs = ev.latencyMs ?? performance.now() - startedAt
             this.runsUsed++
             runsThisStep++
             this.consecutiveEvalErrors = 0
-            this.recordRunCost(scenario, cell, ev)
+            const costUsd = this.recordRunCost(scenario, cell, ev)
             const interest = this.objective.interest(ev, this.objectiveContext())
-            this.log.push({ cell, ev, interest, scenarioId: this.opts.scenarioId(scenario) })
+            this.log.push({
+              cell,
+              ev,
+              interest,
+              latencyMs,
+              ...(costUsd !== undefined ? { costUsd } : {}),
+              scenarioId: this.opts.scenarioId(scenario),
+            })
             this.opts.onProgress?.({ type: 'evaluated', cell, scenario, evaluation: ev })
 
             const bin = this.binId(cell, ev.descriptor)
diff --git a/src/fuzz/fuzz-agent.test.ts b/src/fuzz/fuzz-agent.test.ts
index 4587258..020f575 100644
--- a/src/fuzz/fuzz-agent.test.ts
+++ b/src/fuzz/fuzz-agent.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest'
 import { renderCapsuleHtml } from './capsule'
-import { buildCoverage, enumerateCells } from './cube'
+import { buildCoverage, distribution, enumerateCells } from './cube'
 import { BehaviorExplorer } from './explorer'
 import { fuzzAgent } from './fuzz-agent'
 import { composeGates } from './gates'
@@ -84,9 +84,9 @@ describe('enumerateCells + buildCoverage', () => {
     const [c0, c1] = cells
     if (!c0 || !c1) throw new Error('expected cells')
     const ev: Evaluation = { valid: true, score: 0.8 }
-    const cov = buildCoverage(cells, [{ cell: c0, ev, interest: 0.2 }], 0.5)
-    expect(cov.find((c) => c.cell.id === c0.id)?.robustness).toBeCloseTo(0.8)
-    expect(cov.find((c) => c.cell.id === c1.id)?.robustness).toBeNull()
+    const cov = buildCoverage(cells, [{ cell: c0, ev, interest: 0.2, latencyMs: 12 }], 0.5)
+    expect(cov.find((c) => c.cell.id === c0.id)?.score?.mean).toBeCloseTo(0.8)
+    expect(cov.find((c) => c.cell.id === c1.id)?.score).toBeNull()
   })
 })
 
@@ -98,8 +98,8 @@ describe('fuzzAgent (adversarial preset)', () => {
     expect(capsule.stats.cellsCovered).toBe(4)
 
     for (const c of capsule.coverage) {
-      if (c.cell.coords.difficulty === 'hard') expect(c.robustness ?? 1).toBeLessThan(0.5)
-      else expect(c.robustness ?? 0).toBeGreaterThanOrEqual(0.5)
+      if (c.cell.coords.difficulty === 'hard') expect(c.score?.mean ?? 1).toBeLessThan(0.5)
+      else expect(c.score?.mean ?? 0).toBeGreaterThanOrEqual(0.5)
     }
     expect(capsule.stats.verifiedFindings).toBeGreaterThan(0)
     for (const f of capsule.findings) expect(f.cell.coords.difficulty).toBe('hard')
@@ -113,6 +113,7 @@ describe('fuzzAgent (adversarial preset)', () => {
     expect(covered.length).toBeGreaterThan(0)
     for (const c of covered) {
       expect(Object.keys(c.dimensions)).toEqual(expect.arrayContaining(['correctness', 'safety']))
+      for (const d of Object.values(c.dimensions)) expect(d.n).toBeGreaterThan(0)
     }
   })
 
@@ -127,10 +128,11 @@ describe('fuzzAgent (adversarial preset)', () => {
     expect(new Set(binIds).size).toBe(binIds.length)
   })
 
-  it('is deterministic for a fixed seed', async () => {
+  it('is deterministic for a fixed seed (latency excluded — wall-clock)', async () => {
     const a = await fuzzAgent(base)
     const b = await fuzzAgent(base)
-    expect(a.capsule.stats).toEqual(b.capsule.stats)
+    const strip = ({ latencyMs, ...rest }: typeof a.capsule.stats) => rest
+    expect(strip(a.capsule.stats)).toEqual(strip(b.capsule.stats))
   })
 
   it('validity gates drop candidates (verified <= candidate) and report only gate-passers', async () => {
@@ -330,3 +332,57 @@ describe('eval-error isolation (0.89.1)', () => {
     expect(html).toContain('stopped early')
   })
 })
+
+describe('distributions (coverage carries spread, never a bare mean)', () => {
+  it('per-cell score is a full distribution; latency is engine-measured', async () => {
+    const { capsule } = await fuzzAgent(base)
+    const covered = capsule.coverage.filter((c) => c.runs > 0)
+    for (const c of covered) {
+      const s = c.score
+      if (!s) throw new Error('covered cell missing score distribution')
+      expect(s.n).toBe(c.runs)
+      expect(s.min).toBeLessThanOrEqual(s.median)
+      expect(s.median).toBeLessThanOrEqual(s.p90)
+      expect(s.p90).toBeLessThanOrEqual(s.max)
+      expect(c.latencyMs?.n).toBe(c.runs)
+      expect(c.latencyMs?.min).toBeGreaterThanOrEqual(0)
+    }
+    expect(capsule.stats.robustness?.n).toBe(capsule.stats.cellsCovered)
+    expect(capsule.stats.latencyMs?.n).toBe(capsule.stats.totalRuns)
+  })
+
+  it('consumer-supplied latencyMs overrides engine wall-clock', async () => {
+    const timed: Evaluator<Scn> = async (s) => ({
+      valid: true,
+      score: s.difficulty === 'hard' ? 0.2 : 0.9,
+      scores: { correctness: 0.5 },
+      latencyMs: 1234,
+    })
+    const { capsule } = await fuzzAgent({ ...base, evaluate: timed, budget: 8 })
+    expect(capsule.stats.latencyMs?.median).toBe(1234)
+  })
+
+  it('per-cell cost appears only when costOf is wired, with unknown runs counted apart', async () => {
+    const { capsule: untracked } = await fuzzAgent({ ...base, budget: 8 })
+    for (const c of untracked.coverage) expect(c.costUsd).toBeUndefined()
+
+    let i = 0
+    const { capsule: tracked } = await fuzzAgent({
+      ...base,
+      budget: 8,
+      costOf: () => (i++ % 2 === 0 ? { usd: 0.01 } : null),
+    })
+    const cells = tracked.coverage.filter((c) => c.runs > 0)
+    expect(cells.some((c) => c.costUsd !== undefined)).toBe(true)
+    const totalKnown = cells.reduce((a, c) => a + (c.costUsd ?? 0), 0)
+    expect(totalKnown).toBeCloseTo(tracked.stats.costUsd ?? -1, 5)
+    const totalUnknown = cells.reduce((a, c) => a + (c.costUnknownRuns ?? 0), 0)
+    expect(totalUnknown).toBe(tracked.stats.costUnknownRuns)
+  })
+
+  it('distribution() throws on an empty sample — missing data is null, never zeros', () => {
+    expect(() => distribution([])).toThrow(/empty sample/)
+    const d = distribution([3, 1, 2])
+    expect(d).toEqual({ mean: 2, median: 2, p90: 3, min: 1, max: 3, n: 3 })
+  })
+})
diff --git a/src/fuzz/types.ts b/src/fuzz/types.ts
index ccf6aee..5bb9020 100644
--- a/src/fuzz/types.ts
+++ b/src/fuzz/types.ts
@@ -59,6 +59,9 @@ export interface Evaluation extends DefaultVerdict {
   runId?: string
   /** Structured labels, e.g. failure classes (`hallucination`, `refusal`). */
   labels?: string[]
+  /** Wall-clock for the evaluation, when the consumer measures it more precisely
+   *  than the engine can (e.g. excluding judge time). Engine-measured otherwise. */
+  latencyMs?: number
 }
 
 /** Run the target against one scenario in a cell. */
@@ -143,16 +146,35 @@ export interface ArchiveEntry<S> {
   interest: number
 }
 
+/** Summary of a sample — every aggregate carries its spread, never a bare mean. */
+export interface Distribution {
+  mean: number
+  median: number
+  p90: number
+  min: number
+  max: number
+  n: number
+}
+
 /** Per-INPUT-cell coverage — the planned-vs-covered map. */
 export interface CoverageCell {
   cell: Cell
   runs: number
-  /** Mean headline score in [0,1]; `null` when the cell was never run (honestly uncovered). */
-  robustness: number | null
+  /** Headline score distribution in [0,1]; `null` when the cell was never run
+   *  (honestly uncovered — never a fabricated zero). */
+  score: Distribution | null
   /** Fraction of runs the objective flagged as notable. */
   findingRate: number
-  /** Mean per-dimension scores — surfaces WHICH dimension is weak. */
-  dimensions: Record<string, number>
+  /** Per-dimension score distributions — surfaces WHICH dimension is weak and
+   *  how consistently. */
+  dimensions: Record<string, Distribution>
+  /** Evaluation wall-clock per run; engine-measured unless the evaluation
+   *  carried its own `latencyMs`. `null` when the cell was never run. */
+  latencyMs: Distribution | null
+  /** Known dollars spent in this cell — present only when cost tracking was wired. */
+  costUsd?: number
+  /** Tracked runs whose cost is unknown (never folded in as $0); absent when none. */
+  costUnknownRuns?: number
 }
 
 /** The artifact every exploration produces. */
@@ -177,7 +199,12 @@ export interface CapsuleData<S> {
     behaviorBinsObserved: number
     candidateFindings: number
     verifiedFindings: number
-    meanRobustness: number
+    /** Distribution of per-cell mean scores across covered cells (cells weigh
+     *  equally — variance steering sends more runs to weak cells, so a
+     *  run-weighted average would bias low). `null` when nothing ran. */
+    robustness: Distribution | null
+    /** Evaluation wall-clock across all runs. `null` when nothing ran. */
+    latencyMs: Distribution | null
     /** Known dollars spent on this exploration's runs. Present only when cost
      *  tracking was wired (`costOf`) — absent means "not tracked", never $0. */
     costUsd?: number