From 323b8c19d8c1107cee9ce889dc65a8854908ffc1 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 10 Jun 2026 18:35:01 -0600 Subject: [PATCH] =?UTF-8?q?feat(fuzz):=20coverage=20carries=20distribution?= =?UTF-8?q?s=20=E2=80=94=20score/dimension/latency=20spread=20+=20per-cell?= =?UTF-8?q?=20cost=20(0.91.0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A bare mean hides outliers. CoverageCell.score and every dimension are now full Distributions (mean/median/p90/min/max/n); evaluation latency is engine-measured per run (consumer latencyMs overrides) and aggregates per cell and capsule-wide; per-cell cost splits known dollars from tracked-but-unknown runs (absent when tracking is unwired — never a fabricated $0). Capsule stats replace the bare meanRobustness with a robustness Distribution over per-cell means (cells weigh equally — variance steering would bias a run-weighted average low) plus a latency Distribution over all runs. HTML tiles color by mean and carry median/min/latency/cost in the tooltip; KPIs add cell spread + median latency. --- clients/python/pyproject.toml | 2 +- clients/python/src/agent_eval_rpc/__init__.py | 2 +- examples/fuzz-legal-demo.mjs | 11 +-- package.json | 2 +- src/fuzz/capsule.ts | 28 ++++--- src/fuzz/cube.ts | 73 +++++++++++++++---- src/fuzz/explorer.ts | 24 ++++-- src/fuzz/fuzz-agent.test.ts | 72 ++++++++++++++++-- src/fuzz/types.ts | 37 ++++++++-- 9 files changed, 200 insertions(+), 51 deletions(-) diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 5477b33..e411b1f 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agent-eval-rpc" -version = "0.90.1" +version = "0.91.0" description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client." readme = "README.md" requires-python = ">=3.10" diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py index b325ae7..8d49638 100644 --- a/clients/python/src/agent_eval_rpc/__init__.py +++ b/clients/python/src/agent_eval_rpc/__init__.py @@ -58,7 +58,7 @@ try: __version__ = version("agent-eval-rpc") except PackageNotFoundError: - __version__ = "0.90.1" + __version__ = "0.91.0" __all__ = [ "Client", diff --git a/examples/fuzz-legal-demo.mjs b/examples/fuzz-legal-demo.mjs index 76f5387..d842ef4 100644 --- a/examples/fuzz-legal-demo.mjs +++ b/examples/fuzz-legal-demo.mjs @@ -100,17 +100,18 @@ console.log(JSON.stringify({ totalRuns: s.totalRuns, cells: `${s.cellsCovered}/${s.cellsTotal}`, behaviorBinsObserved: s.behaviorBinsObserved, - meanRobustness: +s.meanRobustness.toFixed(3), + robustness: s.robustness && { mean: +s.robustness.mean.toFixed(3), median: +s.robustness.median.toFixed(3), min: +s.robustness.min.toFixed(3), max: +s.robustness.max.toFixed(3) }, + medianLatencyMs: s.latencyMs && Math.round(s.latencyMs.median), candidateFindings: s.candidateFindings, verifiedFindings: s.verifiedFindings, weakestCells: capsule.coverage - .filter((c) => c.robustness != null) - .sort((a, b) => a.robustness - b.robustness) + .filter((c) => c.score != null) + .sort((a, b) => a.score.mean - b.score.mean) .slice(0, 4) .map((c) => ({ cell: Object.values(c.cell.coords).join('·'), - robustness: +c.robustness.toFixed(2), - weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1] - b[1])[0]?.[0], + score: { mean: +c.score.mean.toFixed(2), min: +c.score.min.toFixed(2) }, + weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1].mean - b[1].mean)[0]?.[0], })), topFinding: capsule.findings[0] ? { diff --git a/package.json b/package.json index 7ec54ac..b336786 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.90.1", + "version": "0.91.0", "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": { diff --git a/src/fuzz/capsule.ts b/src/fuzz/capsule.ts index ef7287c..56bbbf7 100644 --- a/src/fuzz/capsule.ts +++ b/src/fuzz/capsule.ts @@ -9,8 +9,8 @@ */ import type { EvalRecord } from './cube' -import { buildCoverage } from './cube' -import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Finding } from './types' +import { buildCoverage, distribution } from './cube' +import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Distribution, Finding } from './types' export interface BuildCapsuleInput { target: string @@ -35,8 +35,11 @@ export interface BuildCapsuleInput { export function buildCapsule(input: BuildCapsuleInput): CapsuleData { const coverage = buildCoverage(input.cells, input.log, input.threshold) const covered = coverage.filter((c) => c.runs > 0) - const meanRobustness = - covered.length === 0 ? 0 : covered.reduce((a, c) => a + (c.robustness ?? 0), 0) / covered.length + // Cells weigh equally: variance steering sends more runs to weak cells, so a + // run-weighted average would bias the headline low. + const robustness = + covered.length === 0 ? null : distribution(covered.map((c) => (c.score as Distribution).mean)) + const latencyMs = input.log.length === 0 ? null : distribution(input.log.map((r) => r.latencyMs)) // Measured-descriptor bins beyond the bare input cell — observed, never planned. const behaviorBinsObserved = input.archive.filter((e) => e.binId !== e.cell.id).length @@ -53,7 +56,8 @@ export function buildCapsule(input: BuildCapsuleInput): CapsuleData { behaviorBinsObserved, candidateFindings: input.candidateFindings, verifiedFindings: input.findings.length, - meanRobustness, + robustness, + latencyMs, ...(input.cost ? { costUsd: input.cost.costUsd, costUnknownRuns: input.cost.costUnknownRuns } : {}), @@ -99,7 +103,7 @@ function deriveAxes(coverage: CoverageCell[]): Array<{ name: string; values: str /** The weakest dimension chip for a cell, e.g. `safety 32%` — shown when scores exist. */ function weakestDim(c: CoverageCell): string { - const entries = Object.entries(c.dimensions) + const entries = Object.entries(c.dimensions).map(([k, d]) => [k, d.mean] as [string, number]) if (entries.length === 0) return '' const sorted = entries.sort((a, b) => a[1] - b[1]) const w = sorted[0] @@ -111,9 +115,9 @@ function heatmapHtml(coverage: CoverageCell[]): string { const axes = deriveAxes(coverage) const byId = new Map(coverage.map((c) => [c.cell.id, c])) const tile = (c: CoverageCell | undefined, label: string): string => { - const r = c?.robustness ?? null - const title = c - ? `${pct(r ?? 0)} robust · ${c.runs} runs · ${pct(c.findingRate)} flagged` + const r = c?.score?.mean ?? null + const title = c?.score + ? `${pct(c.score.mean)} robust (median ${pct(c.score.median)}, min ${pct(c.score.min)}) · ${c.runs} runs · ${pct(c.findingRate)} flagged${c.latencyMs ? ` · ${(c.latencyMs.median / 1000).toFixed(1)}s median` : ''}${c.costUsd !== undefined ? ` · $${c.costUsd.toFixed(2)}` : ''}` : 'not covered' return `
${label ? `${esc(label)}` : ''}${c && r != null ? pct(r) : '—'}${c ? weakestDim(c) : ''}
` } @@ -136,7 +140,7 @@ function heatmapHtml(coverage: CoverageCell[]): string { return `
rows: ${esc(rowAxis.name)} · cols: ${esc(colAxis.name)}
${head}${rows}
` } - const sorted = [...coverage].sort((a, b) => (a.robustness ?? 2) - (b.robustness ?? 2)) + const sorted = [...coverage].sort((a, b) => (a.score?.mean ?? 2) - (b.score?.mean ?? 2)) return `
${sorted.map((c) => tile(c, Object.values(c.cell.coords).join(' · '))).join('')}
` } @@ -236,7 +240,9 @@ table.heat th.rh{text-align:right}

${esc(capsule.objective)} exploration · ${esc(capsule.target)}

${s.totalRuns} scenarios across ${s.cellsCovered}/${s.cellsTotal} planned cells${s.behaviorBinsObserved > 0 ? ` · ${s.behaviorBinsObserved} measured behavior bins` : ''}${stamp ? ` · ${esc(stamp)}` : ''}
-${kpi('mean robustness', pct(s.meanRobustness), s.meanRobustness < 0.6 ? '#e58a96' : '#5ad17a')} +${s.robustness ? kpi('robustness', `${pct(s.robustness.mean)}`, s.robustness.mean < 0.6 ? '#e58a96' : '#5ad17a') : ''} +${s.robustness ? kpi('cell spread', `${pct(s.robustness.min)}–${pct(s.robustness.max)}`) : ''} +${s.latencyMs ? kpi('median latency', `${(s.latencyMs.median / 1000).toFixed(1)}s`, s.latencyMs.p90 > 4 * s.latencyMs.median ? '#e5b566' : '#e6e6e6') : ''} ${kpi('verified findings', String(s.verifiedFindings), s.verifiedFindings > 0 ? '#e58a96' : '#5ad17a')} ${kpi('cells covered', `${s.cellsCovered}/${s.cellsTotal}`)} ${kpi('scenarios run', String(s.totalRuns))} diff --git a/src/fuzz/cube.ts b/src/fuzz/cube.ts index 8d980cc..6f24c97 100644 --- a/src/fuzz/cube.ts +++ b/src/fuzz/cube.ts @@ -3,12 +3,13 @@ * * Cells are the cartesian product of the input axes — the stratification plan, * enumerable up front so the planned-vs-covered denominator is honest. Coverage - * is projected from the evaluation log: per cell, mean headline robustness, the - * mean of each scored dimension (so the map shows WHICH dimension is weak), and - * the rate at which the active objective flagged a candidate. + * is projected from the evaluation log: per cell, the full DISTRIBUTION of the + * headline score, of each scored dimension, and of evaluation latency — a bare + * mean hides outliers, so every aggregate carries its spread. Per-cell cost is + * split known-dollars vs unknown-runs, never folded into a fabricated $0. */ -import type { BehaviorSpace, Cell, CoverageCell, Evaluation } from './types' +import type { BehaviorSpace, Cell, CoverageCell, Distribution, Evaluation } from './types' /** One recorded evaluation — the unit coverage and the capsule are built from. */ export interface EvalRecord { @@ -16,6 +17,11 @@ export interface EvalRecord { ev: Evaluation /** The objective's interest score for this evaluation. */ interest: number + /** Evaluation wall-clock — engine-measured unless `ev.latencyMs` overrode it. */ + latencyMs: number + /** Known dollars for this run. `null` = cost tracking was wired but this + * run's cost was unknowable (counted apart). Absent = not tracked at all. */ + costUsd?: number | null } /** Enumerate every input cell (cartesian product of the axes), in stable order. */ @@ -37,12 +43,32 @@ export function cellId(space: BehaviorSpace, coords: Record): st return space.axes.map((a) => `${a.name}=${coords[a.name]}`).join('|') } -const mean = (xs: number[]): number => - xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length +/** Nearest-rank percentile on a pre-sorted ascending sample. */ +function percentile(sorted: number[], p: number): number { + const idx = Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1)) + return sorted[idx] as number +} + +/** Summarize a sample. Throws on an empty sample — callers represent "no data" + * as `null`, never as a zeroed distribution. */ +export function distribution(values: number[]): Distribution { + if (values.length === 0) + throw new Error('distribution: empty sample — represent missing data as null, not zeros') + const sorted = [...values].sort((a, b) => a - b) + const mean = sorted.reduce((a, b) => a + b, 0) / sorted.length + return { + mean, + median: percentile(sorted, 0.5), + p90: percentile(sorted, 0.9), + min: sorted[0] as number, + max: sorted[sorted.length - 1] as number, + n: sorted.length, + } +} /** * Project the evaluation log into the per-input-cell coverage map. A cell with - * no evaluations reports `robustness: null` (honestly uncovered), never 0. + * no evaluations reports `score: null` (honestly uncovered), never zeros. */ export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: number): CoverageCell[] { const byCell = new Map() @@ -54,17 +80,36 @@ export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: numbe return cells.map((cell) => { const recs = byCell.get(cell.id) ?? [] const runs = recs.length - if (runs === 0) return { cell, runs: 0, robustness: null, findingRate: 0, dimensions: {} } - const robustness = mean(recs.map((r) => r.ev.score)) + if (runs === 0) + return { cell, runs: 0, score: null, findingRate: 0, dimensions: {}, latencyMs: null } + + const score = distribution(recs.map((r) => r.ev.score)) + const latencyMs = distribution(recs.map((r) => r.latencyMs)) const findingRate = recs.filter((r) => r.interest >= threshold).length / runs - const dims: Record = {} + + const dimSamples: Record = {} for (const r of recs) { for (const [k, v] of Object.entries(r.ev.scores ?? {})) { - ;(dims[k] ??= []).push(v) + ;(dimSamples[k] ??= []).push(v) } } - const dimensions: Record = {} - for (const [k, xs] of Object.entries(dims)) dimensions[k] = mean(xs) - return { cell, runs, robustness, findingRate, dimensions } + const dimensions: Record = {} + for (const [k, xs] of Object.entries(dimSamples)) dimensions[k] = distribution(xs) + + // Cost fields appear only when tracking was wired: known dollars sum, and + // tracked-but-unknown runs counted apart — never folded in as $0. + const tracked = recs.filter((r) => r.costUsd !== undefined) + const known = tracked.filter((r) => r.costUsd !== null) + const cost = + tracked.length > 0 + ? { + costUsd: known.reduce((a, r) => a + (r.costUsd as number), 0), + ...(tracked.length > known.length + ? { costUnknownRuns: tracked.length - known.length } + : {}), + } + : {} + + return { cell, runs, score, findingRate, dimensions, latencyMs, ...cost } }) } diff --git a/src/fuzz/explorer.ts b/src/fuzz/explorer.ts index c5b6052..5297b61 100644 --- a/src/fuzz/explorer.ts +++ b/src/fuzz/explorer.ts @@ -156,12 +156,14 @@ export class BehaviorExplorer { /** Fold one run's cost in: null counts as unknown (never $0); a known cost * accrues toward the budget, lands in the ledger, and fires `onCost`. */ - private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): void { - if (!this.opts.costOf) return + /** Returns the run's known cost, `null` when tracked-but-unknown, `undefined` + * when cost tracking is not wired — the log row mirrors this exactly. */ + private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): number | null | undefined { + if (!this.opts.costOf) return undefined const cost = this.opts.costOf(scenario, cell, ev) if (cost === null) { this.costUnknownRuns++ - return + return null } if (typeof cost.usd !== 'number' || !Number.isFinite(cost.usd) || cost.usd < 0) { throw new RangeError( @@ -178,6 +180,7 @@ export class BehaviorExplorer { tags: { target: this.opts.target, cell: cell.id }, }) this.opts.onCost?.({ usd: cost.usd, channel: 'agent' }) + return cost.usd } /** Elites whose INPUT cell matches — what the proposer mutates/deepens from. */ @@ -242,13 +245,24 @@ export class BehaviorExplorer { // Consecutive failures trip the circuit breaker instead, so a dead // backend stops the run rather than burning the remaining budget. try { + const startedAt = performance.now() const ev = await this.opts.evaluate(scenario, cell) + // Consumer-measured latency wins (it can exclude judge time); the + // engine's wall-clock is the default so latency is never missing. + const latencyMs = ev.latencyMs ?? performance.now() - startedAt this.runsUsed++ runsThisStep++ this.consecutiveEvalErrors = 0 - this.recordRunCost(scenario, cell, ev) + const costUsd = this.recordRunCost(scenario, cell, ev) const interest = this.objective.interest(ev, this.objectiveContext()) - this.log.push({ cell, ev, interest, scenarioId: this.opts.scenarioId(scenario) }) + this.log.push({ + cell, + ev, + interest, + latencyMs, + ...(costUsd !== undefined ? { costUsd } : {}), + scenarioId: this.opts.scenarioId(scenario), + }) this.opts.onProgress?.({ type: 'evaluated', cell, scenario, evaluation: ev }) const bin = this.binId(cell, ev.descriptor) diff --git a/src/fuzz/fuzz-agent.test.ts b/src/fuzz/fuzz-agent.test.ts index 4587258..020f575 100644 --- a/src/fuzz/fuzz-agent.test.ts +++ b/src/fuzz/fuzz-agent.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest' import { renderCapsuleHtml } from './capsule' -import { buildCoverage, enumerateCells } from './cube' +import { buildCoverage, distribution, enumerateCells } from './cube' import { BehaviorExplorer } from './explorer' import { fuzzAgent } from './fuzz-agent' import { composeGates } from './gates' @@ -84,9 +84,9 @@ describe('enumerateCells + buildCoverage', () => { const [c0, c1] = cells if (!c0 || !c1) throw new Error('expected cells') const ev: Evaluation = { valid: true, score: 0.8 } - const cov = buildCoverage(cells, [{ cell: c0, ev, interest: 0.2 }], 0.5) - expect(cov.find((c) => c.cell.id === c0.id)?.robustness).toBeCloseTo(0.8) - expect(cov.find((c) => c.cell.id === c1.id)?.robustness).toBeNull() + const cov = buildCoverage(cells, [{ cell: c0, ev, interest: 0.2, latencyMs: 12 }], 0.5) + expect(cov.find((c) => c.cell.id === c0.id)?.score?.mean).toBeCloseTo(0.8) + expect(cov.find((c) => c.cell.id === c1.id)?.score).toBeNull() }) }) @@ -98,8 +98,8 @@ describe('fuzzAgent (adversarial preset)', () => { expect(capsule.stats.cellsCovered).toBe(4) for (const c of capsule.coverage) { - if (c.cell.coords.difficulty === 'hard') expect(c.robustness ?? 1).toBeLessThan(0.5) - else expect(c.robustness ?? 0).toBeGreaterThanOrEqual(0.5) + if (c.cell.coords.difficulty === 'hard') expect(c.score?.mean ?? 1).toBeLessThan(0.5) + else expect(c.score?.mean ?? 0).toBeGreaterThanOrEqual(0.5) } expect(capsule.stats.verifiedFindings).toBeGreaterThan(0) for (const f of capsule.findings) expect(f.cell.coords.difficulty).toBe('hard') @@ -113,6 +113,7 @@ describe('fuzzAgent (adversarial preset)', () => { expect(covered.length).toBeGreaterThan(0) for (const c of covered) { expect(Object.keys(c.dimensions)).toEqual(expect.arrayContaining(['correctness', 'safety'])) + for (const d of Object.values(c.dimensions)) expect(d.n).toBeGreaterThan(0) } }) @@ -127,10 +128,11 @@ describe('fuzzAgent (adversarial preset)', () => { expect(new Set(binIds).size).toBe(binIds.length) }) - it('is deterministic for a fixed seed', async () => { + it('is deterministic for a fixed seed (latency excluded — wall-clock)', async () => { const a = await fuzzAgent(base) const b = await fuzzAgent(base) - expect(a.capsule.stats).toEqual(b.capsule.stats) + const strip = ({ latencyMs, ...rest }: typeof a.capsule.stats) => rest + expect(strip(a.capsule.stats)).toEqual(strip(b.capsule.stats)) }) it('validity gates drop candidates (verified <= candidate) and report only gate-passers', async () => { @@ -330,3 +332,57 @@ describe('eval-error isolation (0.89.1)', () => { expect(html).toContain('stopped early') }) }) + +describe('distributions (coverage carries spread, never a bare mean)', () => { + it('per-cell score is a full distribution; latency is engine-measured', async () => { + const { capsule } = await fuzzAgent(base) + const covered = capsule.coverage.filter((c) => c.runs > 0) + for (const c of covered) { + const s = c.score + if (!s) throw new Error('covered cell missing score distribution') + expect(s.n).toBe(c.runs) + expect(s.min).toBeLessThanOrEqual(s.median) + expect(s.median).toBeLessThanOrEqual(s.p90) + expect(s.p90).toBeLessThanOrEqual(s.max) + expect(c.latencyMs?.n).toBe(c.runs) + expect(c.latencyMs?.min).toBeGreaterThanOrEqual(0) + } + expect(capsule.stats.robustness?.n).toBe(capsule.stats.cellsCovered) + expect(capsule.stats.latencyMs?.n).toBe(capsule.stats.totalRuns) + }) + + it('consumer-supplied latencyMs overrides engine wall-clock', async () => { + const timed: Evaluator = async (s) => ({ + valid: true, + score: s.difficulty === 'hard' ? 0.2 : 0.9, + scores: { correctness: 0.5 }, + latencyMs: 1234, + }) + const { capsule } = await fuzzAgent({ ...base, evaluate: timed, budget: 8 }) + expect(capsule.stats.latencyMs?.median).toBe(1234) + }) + + it('per-cell cost appears only when costOf is wired, with unknown runs counted apart', async () => { + const { capsule: untracked } = await fuzzAgent({ ...base, budget: 8 }) + for (const c of untracked.coverage) expect(c.costUsd).toBeUndefined() + + let i = 0 + const { capsule: tracked } = await fuzzAgent({ + ...base, + budget: 8, + costOf: () => (i++ % 2 === 0 ? { usd: 0.01 } : null), + }) + const cells = tracked.coverage.filter((c) => c.runs > 0) + expect(cells.some((c) => c.costUsd !== undefined)).toBe(true) + const totalKnown = cells.reduce((a, c) => a + (c.costUsd ?? 0), 0) + expect(totalKnown).toBeCloseTo(tracked.stats.costUsd ?? -1, 5) + const totalUnknown = cells.reduce((a, c) => a + (c.costUnknownRuns ?? 0), 0) + expect(totalUnknown).toBe(tracked.stats.costUnknownRuns) + }) + + it('distribution() throws on an empty sample — missing data is null, never zeros', () => { + expect(() => distribution([])).toThrow(/empty sample/) + const d = distribution([3, 1, 2]) + expect(d).toEqual({ mean: 2, median: 2, p90: 3, min: 1, max: 3, n: 3 }) + }) +}) diff --git a/src/fuzz/types.ts b/src/fuzz/types.ts index ccf6aee..5bb9020 100644 --- a/src/fuzz/types.ts +++ b/src/fuzz/types.ts @@ -59,6 +59,9 @@ export interface Evaluation extends DefaultVerdict { runId?: string /** Structured labels, e.g. failure classes (`hallucination`, `refusal`). */ labels?: string[] + /** Wall-clock for the evaluation, when the consumer measures it more precisely + * than the engine can (e.g. excluding judge time). Engine-measured otherwise. */ + latencyMs?: number } /** Run the target against one scenario in a cell. */ @@ -143,16 +146,35 @@ export interface ArchiveEntry { interest: number } +/** Summary of a sample — every aggregate carries its spread, never a bare mean. */ +export interface Distribution { + mean: number + median: number + p90: number + min: number + max: number + n: number +} + /** Per-INPUT-cell coverage — the planned-vs-covered map. */ export interface CoverageCell { cell: Cell runs: number - /** Mean headline score in [0,1]; `null` when the cell was never run (honestly uncovered). */ - robustness: number | null + /** Headline score distribution in [0,1]; `null` when the cell was never run + * (honestly uncovered — never a fabricated zero). */ + score: Distribution | null /** Fraction of runs the objective flagged as notable. */ findingRate: number - /** Mean per-dimension scores — surfaces WHICH dimension is weak. */ - dimensions: Record + /** Per-dimension score distributions — surfaces WHICH dimension is weak and + * how consistently. */ + dimensions: Record + /** Evaluation wall-clock per run; engine-measured unless the evaluation + * carried its own `latencyMs`. `null` when the cell was never run. */ + latencyMs: Distribution | null + /** Known dollars spent in this cell — present only when cost tracking was + * wired; runs with unknown cost are counted apart, never folded in as $0. */ + costUsd?: number + costUnknownRuns?: number } /** The artifact every exploration produces. */ @@ -177,7 +199,12 @@ export interface CapsuleData { behaviorBinsObserved: number candidateFindings: number verifiedFindings: number - meanRobustness: number + /** Distribution of per-cell mean scores across covered cells (cells weigh + * equally — variance steering sends more runs to weak cells, so a + * run-weighted average would bias low). `null` when nothing ran. */ + robustness: Distribution | null + /** Evaluation wall-clock across all runs. `null` when nothing ran. */ + latencyMs: Distribution | null /** Known dollars spent on this exploration's runs. Present only when cost * tracking was wired (`costOf`) — absent means "not tracked", never $0. */ costUsd?: number