Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "agent-eval-rpc"
version = "0.90.1"
version = "0.91.0"
description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion clients/python/src/agent_eval_rpc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
try:
__version__ = version("agent-eval-rpc")
except PackageNotFoundError:
__version__ = "0.90.1"
__version__ = "0.91.0"

__all__ = [
"Client",
Expand Down
11 changes: 6 additions & 5 deletions examples/fuzz-legal-demo.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,18 @@ console.log(JSON.stringify({
totalRuns: s.totalRuns,
cells: `${s.cellsCovered}/${s.cellsTotal}`,
behaviorBinsObserved: s.behaviorBinsObserved,
meanRobustness: +s.meanRobustness.toFixed(3),
robustness: s.robustness && { mean: +s.robustness.mean.toFixed(3), median: +s.robustness.median.toFixed(3), min: +s.robustness.min.toFixed(3), max: +s.robustness.max.toFixed(3) },
medianLatencyMs: s.latencyMs && Math.round(s.latencyMs.median),
candidateFindings: s.candidateFindings,
verifiedFindings: s.verifiedFindings,
weakestCells: capsule.coverage
.filter((c) => c.robustness != null)
.sort((a, b) => a.robustness - b.robustness)
.filter((c) => c.score != null)
.sort((a, b) => a.score.mean - b.score.mean)
.slice(0, 4)
.map((c) => ({
cell: Object.values(c.cell.coords).join('·'),
robustness: +c.robustness.toFixed(2),
weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1] - b[1])[0]?.[0],
score: { mean: +c.score.mean.toFixed(2), min: +c.score.min.toFixed(2) },
weakestDim: Object.entries(c.dimensions).sort((a, b) => a[1].mean - b[1].mean)[0]?.[0],
})),
topFinding: capsule.findings[0]
? {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-eval",
"version": "0.90.1",
"version": "0.91.0",
"description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
"homepage": "https://github.com/tangle-network/agent-eval#readme",
"repository": {
Expand Down
28 changes: 17 additions & 11 deletions src/fuzz/capsule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
*/

import type { EvalRecord } from './cube'
import { buildCoverage } from './cube'
import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Finding } from './types'
import { buildCoverage, distribution } from './cube'
import type { ArchiveEntry, CapsuleData, Cell, CoverageCell, Distribution, Finding } from './types'

export interface BuildCapsuleInput<S> {
target: string
Expand All @@ -35,8 +35,11 @@ export interface BuildCapsuleInput<S> {
export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
const coverage = buildCoverage(input.cells, input.log, input.threshold)
const covered = coverage.filter((c) => c.runs > 0)
const meanRobustness =
covered.length === 0 ? 0 : covered.reduce((a, c) => a + (c.robustness ?? 0), 0) / covered.length
// Cells weigh equally: variance steering sends more runs to weak cells, so a
// run-weighted average would bias the headline low.
const robustness =
covered.length === 0 ? null : distribution(covered.map((c) => (c.score as Distribution).mean))
const latencyMs = input.log.length === 0 ? null : distribution(input.log.map((r) => r.latencyMs))
// Measured-descriptor bins beyond the bare input cell — observed, never planned.
const behaviorBinsObserved = input.archive.filter((e) => e.binId !== e.cell.id).length

Expand All @@ -53,7 +56,8 @@ export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
behaviorBinsObserved,
candidateFindings: input.candidateFindings,
verifiedFindings: input.findings.length,
meanRobustness,
robustness,
latencyMs,
...(input.cost
? { costUsd: input.cost.costUsd, costUnknownRuns: input.cost.costUnknownRuns }
: {}),
Expand Down Expand Up @@ -99,7 +103,7 @@ function deriveAxes(coverage: CoverageCell[]): Array<{ name: string; values: str

/** The weakest dimension chip for a cell, e.g. `safety 32%` — shown when scores exist. */
function weakestDim(c: CoverageCell): string {
const entries = Object.entries(c.dimensions)
const entries = Object.entries(c.dimensions).map(([k, d]) => [k, d.mean] as [string, number])
if (entries.length === 0) return ''
const sorted = entries.sort((a, b) => a[1] - b[1])
const w = sorted[0]
Expand All @@ -111,9 +115,9 @@ function heatmapHtml(coverage: CoverageCell[]): string {
const axes = deriveAxes(coverage)
const byId = new Map(coverage.map((c) => [c.cell.id, c]))
const tile = (c: CoverageCell | undefined, label: string): string => {
const r = c?.robustness ?? null
const title = c
? `${pct(r ?? 0)} robust · ${c.runs} runs · ${pct(c.findingRate)} flagged`
const r = c?.score?.mean ?? null
const title = c?.score
? `${pct(c.score.mean)} robust (median ${pct(c.score.median)}, min ${pct(c.score.min)}) · ${c.runs} runs · ${pct(c.findingRate)} flagged${c.latencyMs ? ` · ${(c.latencyMs.median / 1000).toFixed(1)}s median` : ''}${c.costUsd !== undefined ? ` · $${c.costUsd.toFixed(2)}` : ''}`
: 'not covered'
return `<div class="tile" style="background:${robustnessColor(r)}" title="${esc(title)}">${label ? `<span class="tl">${esc(label)}</span>` : ''}<span class="tv">${c && r != null ? pct(r) : '—'}</span>${c ? weakestDim(c) : ''}</div>`
}
Expand All @@ -136,7 +140,7 @@ function heatmapHtml(coverage: CoverageCell[]): string {
return `<div class="axis-label">rows: <b>${esc(rowAxis.name)}</b> · cols: <b>${esc(colAxis.name)}</b></div><table class="heat">${head}${rows}</table>`
}

const sorted = [...coverage].sort((a, b) => (a.robustness ?? 2) - (b.robustness ?? 2))
const sorted = [...coverage].sort((a, b) => (a.score?.mean ?? 2) - (b.score?.mean ?? 2))
return `<div class="grid">${sorted.map((c) => tile(c, Object.values(c.cell.coords).join(' · '))).join('')}</div>`
}

Expand Down Expand Up @@ -236,7 +240,9 @@ table.heat th.rh{text-align:right}
<h1>${esc(capsule.objective)} exploration · ${esc(capsule.target)}</h1>
<div class="sub">${s.totalRuns} scenarios across ${s.cellsCovered}/${s.cellsTotal} planned cells${s.behaviorBinsObserved > 0 ? ` · ${s.behaviorBinsObserved} measured behavior bins` : ''}${stamp ? ` · ${esc(stamp)}` : ''}</div>
<div class="kpis">
${kpi('mean robustness', pct(s.meanRobustness), s.meanRobustness < 0.6 ? '#e58a96' : '#5ad17a')}
${s.robustness ? kpi('robustness', `${pct(s.robustness.mean)}`, s.robustness.mean < 0.6 ? '#e58a96' : '#5ad17a') : ''}
${s.robustness ? kpi('cell spread', `${pct(s.robustness.min)}–${pct(s.robustness.max)}`) : ''}
${s.latencyMs ? kpi('median latency', `${(s.latencyMs.median / 1000).toFixed(1)}s`, s.latencyMs.p90 > 4 * s.latencyMs.median ? '#e5b566' : '#e6e6e6') : ''}
${kpi('verified findings', String(s.verifiedFindings), s.verifiedFindings > 0 ? '#e58a96' : '#5ad17a')}
${kpi('cells covered', `${s.cellsCovered}/${s.cellsTotal}`)}
${kpi('scenarios run', String(s.totalRuns))}
Expand Down
73 changes: 59 additions & 14 deletions src/fuzz/cube.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
*
* Cells are the cartesian product of the input axes — the stratification plan,
* enumerable up front so the planned-vs-covered denominator is honest. Coverage
* is projected from the evaluation log: per cell, mean headline robustness, the
* mean of each scored dimension (so the map shows WHICH dimension is weak), and
* the rate at which the active objective flagged a candidate.
* is projected from the evaluation log: per cell, the full DISTRIBUTION of the
* headline score, of each scored dimension, and of evaluation latency — a bare
* mean hides outliers, so every aggregate carries its spread. Per-cell cost is
* split known-dollars vs unknown-runs, never folded into a fabricated $0.
*/

import type { BehaviorSpace, Cell, CoverageCell, Evaluation } from './types'
import type { BehaviorSpace, Cell, CoverageCell, Distribution, Evaluation } from './types'

/**
 * One recorded evaluation — the unit coverage and the capsule are built from.
 * Each record pairs the cell that was exercised with the evaluation result and
 * the per-run measurements (interest, latency, optional cost).
 */
export interface EvalRecord {
/** The input cell this run belongs to; coverage is reported per cell id. */
cell: Cell
/** The evaluation result. Its `score` and optional per-dimension `scores`
 * are what the per-cell coverage distributions are computed from. */
ev: Evaluation
/** The objective's interest score for this evaluation. Coverage counts a run
 * toward a cell's finding rate when this value is at or above the threshold. */
interest: number
/** Evaluation wall-clock in milliseconds — engine-measured unless
 * `ev.latencyMs` overrode it. */
latencyMs: number
/**
 * Known dollars for this run. `null` = cost tracking was wired but this
 * run's cost was unknowable (counted apart). Absent = not tracked at all.
 */
costUsd?: number | null
}

/** Enumerate every input cell (cartesian product of the axes), in stable order. */
Expand All @@ -37,12 +43,32 @@ export function cellId(space: BehaviorSpace, coords: Record<string, string>): st
return space.axes.map((a) => `${a.name}=${coords[a.name]}`).join('|')
}

const mean = (xs: number[]): number =>
xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length
/**
 * Nearest-rank percentile lookup.
 *
 * @param sorted Sample already ordered ascending — this function does not sort.
 * @param p Fraction selecting the rank (e.g. `0.5` for the median, `0.9` for p90).
 * @returns The element at 1-based rank `ceil(p * length)`, with the resulting
 *   index clamped into `[0, length - 1]`.
 */
function percentile(sorted: number[], p: number): number {
  const lastIndex = sorted.length - 1
  // Convert the 1-based nearest rank into a 0-based index.
  const rankIndex = Math.ceil(p * sorted.length) - 1
  // Clamp from below first, then from above, so out-of-range `p` still lands
  // on the first or last element.
  const nonNegative = Math.max(0, rankIndex)
  const position = Math.min(lastIndex, nonNegative)
  return sorted[position] as number
}

/**
 * Summarize a numeric sample as mean, median, p90, min, max and sample size.
 *
 * The input array is left untouched; sorting happens on a copy. Median and
 * p90 come from `percentile` applied to the ascending copy.
 *
 * @param values The sample to summarize.
 * @returns The summary statistics for `values`.
 * @throws Error when `values` is empty — callers represent "no data" as
 *   `null`, never as a zeroed distribution.
 */
export function distribution(values: number[]): Distribution {
  if (values.length === 0) {
    throw new Error('distribution: empty sample — represent missing data as null, not zeros')
  }

  // Copy before sorting so the caller's array keeps its order.
  const ascending = Array.from(values)
  ascending.sort((x, y) => x - y)

  const n = ascending.length
  let total = 0
  for (const v of ascending) total += v

  return {
    mean: total / n,
    median: percentile(ascending, 0.5),
    p90: percentile(ascending, 0.9),
    min: ascending[0] as number,
    max: ascending[n - 1] as number,
    n,
  }
}

/**
* Project the evaluation log into the per-input-cell coverage map. A cell with
* no evaluations reports `robustness: null` (honestly uncovered), never 0.
* no evaluations reports `score: null` (honestly uncovered), never zeros.
*/
export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: number): CoverageCell[] {
const byCell = new Map<string, EvalRecord[]>()
Expand All @@ -54,17 +80,36 @@ export function buildCoverage(cells: Cell[], log: EvalRecord[], threshold: numbe
return cells.map((cell) => {
const recs = byCell.get(cell.id) ?? []
const runs = recs.length
if (runs === 0) return { cell, runs: 0, robustness: null, findingRate: 0, dimensions: {} }
const robustness = mean(recs.map((r) => r.ev.score))
if (runs === 0)
return { cell, runs: 0, score: null, findingRate: 0, dimensions: {}, latencyMs: null }

const score = distribution(recs.map((r) => r.ev.score))
const latencyMs = distribution(recs.map((r) => r.latencyMs))
const findingRate = recs.filter((r) => r.interest >= threshold).length / runs
const dims: Record<string, number[]> = {}

const dimSamples: Record<string, number[]> = {}
for (const r of recs) {
for (const [k, v] of Object.entries(r.ev.scores ?? {})) {
;(dims[k] ??= []).push(v)
;(dimSamples[k] ??= []).push(v)
}
}
const dimensions: Record<string, number> = {}
for (const [k, xs] of Object.entries(dims)) dimensions[k] = mean(xs)
return { cell, runs, robustness, findingRate, dimensions }
const dimensions: Record<string, Distribution> = {}
for (const [k, xs] of Object.entries(dimSamples)) dimensions[k] = distribution(xs)

// Cost fields appear only when tracking was wired: known dollars sum, and
// tracked-but-unknown runs counted apart — never folded in as $0.
const tracked = recs.filter((r) => r.costUsd !== undefined)
const known = tracked.filter((r) => r.costUsd !== null)
const cost =
tracked.length > 0
? {
costUsd: known.reduce((a, r) => a + (r.costUsd as number), 0),
...(tracked.length > known.length
? { costUnknownRuns: tracked.length - known.length }
: {}),
}
: {}

return { cell, runs, score, findingRate, dimensions, latencyMs, ...cost }
})
}
24 changes: 19 additions & 5 deletions src/fuzz/explorer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,14 @@ export class BehaviorExplorer<S> {

/** Fold one run's cost in: null counts as unknown (never $0); a known cost
* accrues toward the budget, lands in the ledger, and fires `onCost`. */
private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): void {
if (!this.opts.costOf) return
/** Returns the run's known cost, `null` when tracked-but-unknown, `undefined`
* when cost tracking is not wired — the log row mirrors this exactly. */
private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): number | null | undefined {
if (!this.opts.costOf) return undefined
const cost = this.opts.costOf(scenario, cell, ev)
if (cost === null) {
this.costUnknownRuns++
return
return null
}
if (typeof cost.usd !== 'number' || !Number.isFinite(cost.usd) || cost.usd < 0) {
throw new RangeError(
Expand All @@ -178,6 +180,7 @@ export class BehaviorExplorer<S> {
tags: { target: this.opts.target, cell: cell.id },
})
this.opts.onCost?.({ usd: cost.usd, channel: 'agent' })
return cost.usd
}

/** Elites whose INPUT cell matches — what the proposer mutates/deepens from. */
Expand Down Expand Up @@ -242,13 +245,24 @@ export class BehaviorExplorer<S> {
// Consecutive failures trip the circuit breaker instead, so a dead
// backend stops the run rather than burning the remaining budget.
try {
const startedAt = performance.now()
const ev = await this.opts.evaluate(scenario, cell)
// Consumer-measured latency wins (it can exclude judge time); the
// engine's wall-clock is the default so latency is never missing.
const latencyMs = ev.latencyMs ?? performance.now() - startedAt
this.runsUsed++
runsThisStep++
this.consecutiveEvalErrors = 0
this.recordRunCost(scenario, cell, ev)
const costUsd = this.recordRunCost(scenario, cell, ev)
const interest = this.objective.interest(ev, this.objectiveContext())
this.log.push({ cell, ev, interest, scenarioId: this.opts.scenarioId(scenario) })
this.log.push({
cell,
ev,
interest,
latencyMs,
...(costUsd !== undefined ? { costUsd } : {}),
scenarioId: this.opts.scenarioId(scenario),
})
this.opts.onProgress?.({ type: 'evaluated', cell, scenario, evaluation: ev })

const bin = this.binId(cell, ev.descriptor)
Expand Down
Loading
Loading