tangle-network · drewstone · Jun 11, 2026 · Jun 11, 2026
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.90.1"
+version = "0.91.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.90.1"
+    __version__ = "0.91.0"
 
 __all__ = [
     "Client",

diff --git a/examples/fuzz-legal-demo.mjs b/examples/fuzz-legal-demo.mjs
@@ -100,17 +100,18 @@ console.log(JSON.stringify({
   totalRuns: s.totalRuns,
   cells: `${s.cellsCovered}/${s.cellsTotal}`,
   behaviorBinsObserved: s.behaviorBinsObserved,
-  meanRobustness: +s.meanRobustness.toFixed(3),
+  robustness: s.robustness && { mean: +s.robustness.mean.toFixed(3), median: +s.robustness.median.toFixed(3), min: +s.robustness.min.toFixed(3), max: +s.robustness.max.toFixed(3) },
+  medianLatencyMs: s.latencyMs && Math.round(s.latencyMs.median),
   candidateFindings: s.candidateFindings,
   verifiedFindings: s.verifiedFindings,
   weakestCells: capsule.coverage
-    .filter((c) => c.robustness != null)
-    .sort((a, b) => a.robustness - b.robustness)
+    .filter((c) => c.score != null)
+    .sort((a, b) => a.score.mean - b.score.mean)
     .slice(0, 4)
     .map((c) => ({
       cell: Object.values(c.cell.coords).join('·'),
-      robustness: +c.robustness.toFixed(2),
-      weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1] - b[1])[0]?.[0],
+      score: { mean: +c.score.mean.toFixed(2), min: +c.score.min.toFixed(2) },
+      weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1].mean - b[1].mean)[0]?.[0],
     })),
   topFinding: capsule.findings[0]
     ? {

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.90.1",
+  "version": "0.91.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {

diff --git a/src/fuzz/capsule.ts b/src/fuzz/capsule.ts
@@ -9,8 +9,8 @@
  */
 
 import type { EvalRecord } from './cube'
-import { buildCoverage } from './cube'
-import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Finding } from './types'
+import { buildCoverage, distribution } from './cube'
+import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Distribution, Finding } from './types'
 
 export interface BuildCapsuleInput<S> {
   target: string
@@ -35,8 +35,11 @@ export interface BuildCapsuleInput<S> {
 export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
   const coverage = buildCoverage(input.cells, input.log, input.threshold)
   const covered = coverage.filter((c) => c.runs > 0)
-  const meanRobustness =
-    covered.length === 0 ? 0 : covered.reduce((a, c) => a + (c.robustness ?? 0), 0) / covered.length
+  // Cells weigh equally: variance steering sends more runs to weak cells, so a
+  // run-weighted average would bias the headline low.
+  const robustness =
+    covered.length === 0 ? null : distribution(covered.map((c) => (c.score as Distribution).mean))
+  const latencyMs = input.log.length === 0 ? null : distribution(input.log.map((r) => r.latencyMs))
   // Measured-descriptor bins beyond the bare input cell — observed, never planned.
   const behaviorBinsObserved = input.archive.filter((e) => e.binId !== e.cell.id).length
 
@@ -53,7 +56,8 @@ export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
       behaviorBinsObserved,
       candidateFindings: input.candidateFindings,
       verifiedFindings: input.findings.length,
-      meanRobustness,
+      robustness,
+      latencyMs,
       ...(input.cost
         ? { costUsd: input.cost.costUsd, costUnknownRuns: input.cost.costUnknownRuns }
         : {}),
@@ -99,7 +103,7 @@ function deriveAxes(coverage: CoverageCell[]): Array<{ name: string; values: str
 
 /** The weakest dimension chip for a cell, e.g. `safety 32%` — shown when scores exist. */
 function weakestDim(c: CoverageCell): string {
-  const entries = Object.entries(c.dimensions)
+  const entries = Object.entries(c.dimensions).map(([k, d]) => [k, d.mean] as [string, number])
   if (entries.length === 0) return ''
   const sorted = entries.sort((a, b) => a[1] - b[1])
   const w = sorted[0]
@@ -111,9 +115,9 @@ function heatmapHtml(coverage: CoverageCell[]): string {
   const axes = deriveAxes(coverage)
   const byId = new Map(coverage.map((c) => [c.cell.id, c]))
   const tile = (c: CoverageCell | undefined, label: string): string => {
-    const r = c?.robustness ?? null
-    const title = c
-      ? `${pct(r ?? 0)} robust · ${c.runs} runs · ${pct(c.findingRate)} flagged`
+    const r = c?.score?.mean ?? null
+    const title = c?.score
+      ? `${pct(c.score.mean)} robust (median ${pct(c.score.median)}, min ${pct(c.score.min)}) · ${c.runs} runs · ${pct(c.findingRate)} flagged${c.latencyMs ? ` · ${(c.latencyMs.median / 1000).toFixed(1)}s median` : ''}${c.costUsd !== undefined ? ` · $${c.costUsd.toFixed(2)}` : ''}`
       : 'not covered'
     return `<div class="tile" style="background:${robustnessColor(r)}" title="${esc(title)}">${label ? `<span class="tl">${esc(label)}</span>` : ''}<span class="tv">${c && r != null ? pct(r) : '—'}</span>${c ? weakestDim(c) : ''}</div>`
   }
@@ -136,7 +140,7 @@ function heatmapHtml(coverage: CoverageCell[]): string {
     return `<div class="axis-label">rows: <b>${esc(rowAxis.name)}</b> · cols: <b>${esc(colAxis.name)}</b></div><table class="heat">${head}${rows}</table>`
   }
 
-  const sorted = [...coverage].sort((a, b) => (a.robustness ?? 2) - (b.robustness ?? 2))
+  const sorted = [...coverage].sort((a, b) => (a.score?.mean ?? 2) - (b.score?.mean ?? 2))
   return `<div class="grid">${sorted.map((c) => tile(c, Object.values(c.cell.coords).join(' · '))).join('')}</div>`
 }
 
@@ -236,7 +240,9 @@ table.heat th.rh{text-align:right}
 <h1>${esc(capsule.objective)} exploration · ${esc(capsule.target)}</h1>
 <div class="sub">${s.totalRuns} scenarios across ${s.cellsCovered}/${s.cellsTotal} planned cells${s.behaviorBinsObserved > 0 ? ` · ${s.behaviorBinsObserved} measured behavior bins` : ''}${stamp ? ` · ${esc(stamp)}` : ''}</div>
 <div class="kpis">
-${kpi('mean robustness', pct(s.meanRobustness), s.meanRobustness < 0.6 ? '#e58a96' : '#5ad17a')}
+${s.robustness ? kpi('robustness', `${pct(s.robustness.mean)}`, s.robustness.mean < 0.6 ? '#e58a96' : '#5ad17a') : ''}
+${s.robustness ? kpi('cell spread', `${pct(s.robustness.min)}–${pct(s.robustness.max)}`) : ''}
+${s.latencyMs ? kpi('median latency', `${(s.latencyMs.median / 1000).toFixed(1)}s`, s.latencyMs.p90 > 4 * s.latencyMs.median ? '#e5b566' : '#e6e6e6') : ''}
 ${kpi('verified findings', String(s.verifiedFindings), s.verifiedFindings > 0 ? '#e58a96' : '#5ad17a')}
 ${kpi('cells covered', `${s.cellsCovered}/${s.cellsTotal}`)}
 ${kpi('scenarios run', String(s.totalRuns))}

diff --git a/src/fuzz/cube.ts b/src/fuzz/cube.ts
@@ -3,19 +3,25 @@
  *
  * Cells are the cartesian product of the input axes — the stratification plan,
  * enumerable up front so the planned-vs-covered denominator is honest. Coverage
- * is projected from the evaluation log: per cell, mean headline robustness, the
- * mean of each scored dimension (so the map shows WHICH dimension is weak), and
- * the rate at which the active objective flagged a candidate.
+ * is projected from the evaluation log: per cell, the full DISTRIBUTION of the
+ * headline score, of each scored dimension, and of evaluation latency — a bare
+ * mean hides outliers, so every aggregate carries its spread. Per-cell cost is
+ * split known-dollars vs unknown-runs, never folded into a fabricated $0.
  */
 
-import type { BehaviorSpace, Cell, CoverageCell, Evaluation } from './types'
+import type { BehaviorSpace, Cell, CoverageCell, Distribution, Evaluation } from './types'
 
 /** One recorded evaluation — the unit coverage and the capsule are built from. */
 export interface EvalRecord {
   cell: Cell
   ev: Evaluation
   /** The objective's interest score for this evaluation. */
   interest: number
+  /** Evaluation wall-clock in ms — engine-measured unless `ev.latencyMs` overrode it. */
+  latencyMs: number
+  /** Known dollars for this run. `null` = cost tracking was wired but this
+   *  run's cost was unknowable (counted apart). Absent = not tracked at all. */
+  costUsd?: number | null
 }
 
 /** Enumerate every input cell (cartesian product of the axes), in stable order. */
@@ -37,12 +43,32 @@ export function cellId(space: BehaviorSpace, coords: Record<string, string>): st
   return space.axes.map((a) => `${a.name}=${coords[a.name]}`).join('|')
 }
 
-const mean = (xs: number[]): number =>
-  xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length
+/** Nearest-rank percentile (fraction `p`, e.g. 0.9) of a non-empty ascending-sorted sample. */
+function percentile(sorted: number[], p: number): number {
+  const idx = Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))
+  return sorted[idx] as number
+}
+
+/** Summarize a sample (copied before sorting; median/p90 are nearest-rank). Throws on an
+ *  empty sample — callers represent "no data" as `null`, never as a zeroed distribution. */
+export function distribution(values: number[]): Distribution {
+  if (values.length === 0)
+    throw new Error('distribution: empty sample — represent missing data as null, not zeros')
+  const sorted = [...values].sort((a, b) => a - b)
+  const mean = sorted.reduce((a, b) => a + b, 0) / sorted.length
+  return {
+    mean,
+    median: percentile(sorted, 0.5),
+    p90: percentile(sorted, 0.9),
+    min: sorted[0] as number,
+    max: sorted[sorted.length - 1] as number,
+    n: sorted.length,
+  }
+}
 
 /**
  * Project the evaluation log into the per-input-cell coverage map. A cell with
- * no evaluations reports `robustness: null` (honestly uncovered), never 0.
+ * no evaluations reports `score: null` (honestly uncovered), never zeros.
  */
 export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: number): CoverageCell[] {
   const byCell = new Map<string, EvalRecord[]>()
@@ -54,17 +80,36 @@ export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: numbe
   return cells.map((cell) => {
     const recs = byCell.get(cell.id) ?? []
     const runs = recs.length
-    if (runs === 0) return { cell, runs: 0, robustness: null, findingRate: 0, dimensions: {} }
-    const robustness = mean(recs.map((r) => r.ev.score))
+    if (runs === 0)
+      return { cell, runs: 0, score: null, findingRate: 0, dimensions: {}, latencyMs: null }
+
+    const score = distribution(recs.map((r) => r.ev.score))
+    const latencyMs = distribution(recs.map((r) => r.latencyMs))
     const findingRate = recs.filter((r) => r.interest >= threshold).length / runs
-    const dims: Record<string, number[]> = {}
+
+    const dimSamples: Record<string, number[]> = {}
     for (const r of recs) {
       for (const [k, v] of Object.entries(r.ev.scores ?? {})) {
-        ;(dims[k] ??= []).push(v)
+        ;(dimSamples[k] ??= []).push(v)
       }
     }
-    const dimensions: Record<string, number> = {}
-    for (const [k, xs] of Object.entries(dims)) dimensions[k] = mean(xs)
-    return { cell, runs, robustness, findingRate, dimensions }
+    const dimensions: Record<string, Distribution> = {}
+    for (const [k, xs] of Object.entries(dimSamples)) dimensions[k] = distribution(xs)
+
+    // Cost fields appear only when tracking was wired: known dollars sum, and
+    // tracked-but-unknown runs counted apart — never folded in as $0.
+    const tracked = recs.filter((r) => r.costUsd !== undefined)
+    const known = tracked.filter((r) => r.costUsd !== null)
+    const cost =
+      tracked.length > 0
+        ? {
+            costUsd: known.reduce((a, r) => a + (r.costUsd as number), 0),
+            ...(tracked.length > known.length
+              ? { costUnknownRuns: tracked.length - known.length }
+              : {}),
+          }
+        : {}
+
+    return { cell, runs, score, findingRate, dimensions, latencyMs, ...cost }
   })
 }
diff --git a/src/fuzz/explorer.ts b/src/fuzz/explorer.ts
@@ -156,12 +156,14 @@ export class BehaviorExplorer<S> {
 
-  /** Fold one run's cost in: null counts as unknown (never $0); a known cost
-   *  accrues toward the budget, lands in the ledger, and fires `onCost`. */
-  private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): void {
-    if (!this.opts.costOf) return
+  /** Fold one run's cost in: null counts as unknown (never $0); a known cost
+   *  accrues toward the budget, lands in the ledger, and fires `onCost`.
+   *  Returns the run's known cost, `null` when tracked-but-unknown, `undefined`
+   *  when cost tracking is not wired — the log row mirrors this exactly. */
+  private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): number | null | undefined {
+    if (!this.opts.costOf) return undefined
     const cost = this.opts.costOf(scenario, cell, ev)
     if (cost === null) {
       this.costUnknownRuns++
-      return
+      return null
     }
     if (typeof cost.usd !== 'number' || !Number.isFinite(cost.usd) || cost.usd < 0) {
       throw new RangeError(
@@ -178,6 +180,7 @@ export class BehaviorExplorer<S> {
       tags: { target: this.opts.target, cell: cell.id },
     })
     this.opts.onCost?.({ usd: cost.usd, channel: 'agent' })
+    return cost.usd
   }
 
   /** Elites whose INPUT cell matches — what the proposer mutates/deepens from. */
@@ -242,13 +245,24 @@ export class BehaviorExplorer<S> {
           // Consecutive failures trip the circuit breaker instead, so a dead
           // backend stops the run rather than burning the remaining budget.
           try {
+            const startedAt = performance.now()
             const ev = await this.opts.evaluate(scenario, cell)
+            // Consumer-measured latency wins (it can exclude judge time); the
+            // engine's wall-clock is the default so latency is never missing.
+            const latencyMs = ev.latencyMs ?? performance.now() - startedAt
             this.runsUsed++
             runsThisStep++
             this.consecutiveEvalErrors = 0
-            this.recordRunCost(scenario, cell, ev)
+            const costUsd = this.recordRunCost(scenario, cell, ev)
             const interest = this.objective.interest(ev, this.objectiveContext())
-            this.log.push({ cell, ev, interest, scenarioId: this.opts.scenarioId(scenario) })
+            this.log.push({
+              cell,
+              ev,
+              interest,
+              latencyMs,
+              ...(costUsd !== undefined ? { costUsd } : {}),
+              scenarioId: this.opts.scenarioId(scenario),
+            })
             this.opts.onProgress?.({ type: 'evaluated', cell, scenario, evaluation: ev })
 
             const bin = this.binId(cell, ev.descriptor)