From a17dca38d0b76fd4cf0df92869e959d831add3ee Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 10 Jun 2026 04:42:52 -0600 Subject: [PATCH] feat(diagnose): causal sweep, responsibility scoring, replay-validated repair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New ./diagnose subpath orchestrating the dormant counterfactual primitives into a three-stage remediation chain: - causalSweep — reps x steps x mutations within a hard replay budget, composed over runCounterfactual; per-step mean effect + bootstrap CI (confidenceInterval) ranked by |meanEffect|; kind-level aggregate via attributeCounterfactuals; budget exhaustion names uncovered steps. - prescribeRepair — consumer-supplied proposeFix candidates are machine-verified by replaying WITH the mutation; a repair counts only when every validation rep crosses flipThreshold; non-flippers and replay errors land in rejected with typed reasons. - Remediation adapters into existing machinery: toAnalystFindings (makeFinding, severity from effect size, CI-gated), toCorpusRecord (pins the failure as a permanent corpus scenario, validateRunRecord at the boundary), suggestInvariant (never/without hint shape for trace contracts). Deterministic tests fake the CounterfactualRunner seam with seeded mulberry32 noise; no LLM calls. 
--- package.json | 5 + src/diagnose/causal-sweep.ts | 243 +++++++++++++++++ src/diagnose/index.ts | 61 +++++ src/diagnose/remediation.ts | 206 +++++++++++++++ src/diagnose/repair.ts | 200 ++++++++++++++ tests/diagnose.test.ts | 490 +++++++++++++++++++++++++++++++++++ tsup.config.ts | 1 + 7 files changed, 1206 insertions(+) create mode 100644 src/diagnose/causal-sweep.ts create mode 100644 src/diagnose/index.ts create mode 100644 src/diagnose/remediation.ts create mode 100644 src/diagnose/repair.ts create mode 100644 tests/diagnose.test.ts diff --git a/package.json b/package.json index 9d531b1..f816004 100644 --- a/package.json +++ b/package.json @@ -39,6 +39,11 @@ "import": "./dist/rl.js", "default": "./dist/rl.js" }, + "./diagnose": { + "types": "./dist/diagnose.d.ts", + "import": "./dist/diagnose.js", + "default": "./dist/diagnose.js" + }, "./traces": { "types": "./dist/traces.d.ts", "import": "./dist/traces.js", diff --git a/src/diagnose/causal-sweep.ts b/src/diagnose/causal-sweep.ts new file mode 100644 index 0000000..5e19389 --- /dev/null +++ b/src/diagnose/causal-sweep.ts @@ -0,0 +1,243 @@ +/** + * Causal sweep — WHY did this run fail? + * + * Orchestrates the dormant counterfactual primitives into a responsibility + * report: for each candidate step, run `reps` counterfactual replays per + * mutation (via `runCounterfactual` — the consumer's `CounterfactualRunner` + * is the execution seam) and reduce the per-rep score deltas into a mean + * effect + bootstrap confidence interval (via `confidenceInterval`). + * + * Why `reps` is REQUIRED: a single intervention delta is one stochastic + * draw — LLM re-execution from a prefix is sampled, so one replay cannot + * distinguish "this step caused the failure" from sampling noise. The + * signal is the distribution of deltas across reps; the CI over that + * distribution is what lets a caller say "this step's effect excludes + * zero" instead of eyeballing a point estimate. 
+ * + * Budget discipline: the sweep never silently drops cells. When the + * remaining budget cannot fund a full `reps`-sized cell, the sweep halts + * and every step not fully probed is named in `uncovered`. + */ + +import { + attributeCounterfactuals, + type CounterfactualMutation, + type CounterfactualResult, + type CounterfactualRunner, + runCounterfactual, +} from '../counterfactual' +import { ValidationError } from '../errors' +import { confidenceInterval } from '../statistics' +import type { Span } from '../trace/schema' +import type { TraceStore } from '../trace/store' +import { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory' + +/** Stable reference to a trajectory step — carried through reports, + * findings, and corpus records so evidence stays addressable. */ +export interface StepRef { + index: number + spanId: string + kind: Span['kind'] + name: string +} + +export function stepRefOf(step: TrajectoryStep): StepRef { + return { + index: step.index, + spanId: step.span.spanId, + kind: step.span.kind, + name: step.span.name, + } +} + +export interface CausalSweepOptions { + store: TraceStore + /** The failed run to diagnose. Its `outcome.score` is the baseline every + * counterfactual delta is measured against. */ + runId: string + /** Execution seam — identical contract to `runCounterfactual`: re-runs the + * agent from the mutation point and MUST `endRun` with a numeric score. */ + runner: CounterfactualRunner + /** Trajectory indices to probe. Default: every llm + tool span — the kinds + * the existing `CounterfactualMutation` set targets. */ + candidateSteps?: number[] + /** + * Mutations to probe a given step with. Returned mutations MUST target + * `step.index`. Default probes are the payload-free existing kinds: + * - tool span → `swap-tool-result` with `newResult: null` (knockout: + * how much did the run depend on this tool's information?) 
+ * - llm span → `truncate-after` (re-roll: how much did the realized + * turn deviate from the policy's typical continuation?) + * `swap-model` / `inject-system-message` need consumer payloads, so they + * are opt-in via this callback. + */ + mutationsPerStep?: (step: TrajectoryStep) => CounterfactualMutation[] + /** Replays per (step, mutation) cell. Minimum 2 — see module doc. */ + reps: number + /** Hard cap on total counterfactual replays across the whole sweep. */ + budget: number + /** Seed for the bootstrap CI resampler. Deterministic default so two + * sweeps over the same deltas report identical intervals. */ + ciSeed?: number + /** Bootstrap CI confidence level. Default 0.95. */ + ciConfidence?: number +} + +export interface StepResponsibility { + stepRef: StepRef + mutationKind: CounterfactualMutation['kind'] + /** Mean of per-rep score deltas (counterfactual − original). */ + meanEffect: number + /** Bootstrap CI over the per-rep deltas. */ + ci: { mean: number; lower: number; upper: number } + /** `ci.lower > 0 || ci.upper < 0` — the effect is distinguishable from noise. */ + ciExcludesZero: boolean + reps: number + /** Raw per-rep deltas — downstream evidence, never re-derived. */ + deltas: number[] + /** Replay run ids (layer='meta', parentRunId=original) for audit. */ + counterfactualRunIds: string[] +} + +export interface CausalResponsibilityReport { + runId: string + originalScore: number + /** Ranked by |meanEffect| descending — the blame ordering. */ + steps: StepResponsibility[] + /** Kind-level aggregate from the existing `attributeCounterfactuals`. */ + byMutationKind: ReturnType<typeof attributeCounterfactuals> + replaysUsed: number + budget: number + /** Steps planned but not fully probed before the budget ran out. + * Named, never silent: an absent step is "no effect found"; an + * uncovered step is "not measured". 
*/ + uncovered: StepRef[] +} + +const DEFAULT_CI_SEED = 0x5eed + +function defaultMutations(step: TrajectoryStep): CounterfactualMutation[] { + if (step.span.kind === 'tool') { + return [{ kind: 'swap-tool-result', at: step.index, newResult: null }] + } + if (step.span.kind === 'llm') { + return [{ kind: 'truncate-after', at: step.index }] + } + return [] +} + +export async function causalSweep(opts: CausalSweepOptions): Promise<CausalResponsibilityReport> { + if (!Number.isInteger(opts.reps) || opts.reps < 2) { + throw new ValidationError( + `causalSweep: reps must be an integer >= 2 (got ${opts.reps}) — a single-intervention delta is one stochastic draw, not a measurement`, + ) + } + if (!Number.isInteger(opts.budget) || opts.budget < 1) { + throw new ValidationError(`causalSweep: budget must be an integer >= 1 (got ${opts.budget})`) + } + + const originalRun = await opts.store.getRun(opts.runId) + if (!originalRun) throw new ValidationError(`causalSweep: run ${opts.runId} not found`) + const originalScore = originalRun.outcome?.score + if (typeof originalScore !== 'number' || !Number.isFinite(originalScore)) { + throw new ValidationError( + `causalSweep: run ${opts.runId} has no numeric outcome.score — deltas have no baseline`, + ) + } + + const trajectory = await buildTrajectory(opts.store, opts.runId) + const candidates = resolveCandidates(trajectory, opts.candidateSteps) + const mutationsFor = opts.mutationsPerStep ?? 
defaultMutations + + interface Cell { + step: TrajectoryStep + mutation: CounterfactualMutation + } + const cells: Cell[] = [] + for (const step of candidates) { + const mutations = mutationsFor(step) + for (const m of mutations) { + if (m.at !== step.index) { + throw new ValidationError( + `causalSweep: mutationsPerStep returned a mutation targeting at=${m.at} for step index=${step.index} — mutations must target the step they were asked for`, + ) + } + cells.push({ step, mutation: m }) + } + } + + const responsibilities: StepResponsibility[] = [] + const allResults: CounterfactualResult[] = [] + const uncoveredIndices = new Set<number>() + let replaysUsed = 0 + let halted = false + + for (const cell of cells) { + if (halted || replaysUsed + opts.reps > opts.budget) { + // A partial cell would report a CI over fewer reps than requested — + // weaker evidence masquerading as the real thing. Halt and name it. + halted = true + uncoveredIndices.add(cell.step.index) + continue + } + const deltas: number[] = [] + const cfRunIds: string[] = [] + for (let rep = 0; rep < opts.reps; rep++) { + const result = await runCounterfactual(opts.store, opts.runId, cell.mutation, opts.runner) + replaysUsed++ + const d = result.delta.deltaScore + if (typeof d !== 'number' || !Number.isFinite(d)) { + throw new ValidationError( + `causalSweep: counterfactual replay for step ${cell.step.index} (${cell.mutation.kind}) rep ${rep} produced no numeric score — the runner must endRun with a numeric outcome.score`, + ) + } + deltas.push(d) + cfRunIds.push(result.counterfactualRunId) + allResults.push(result) + } + const ci = confidenceInterval(deltas, opts.ciConfidence ?? 0.95, { + seed: opts.ciSeed ?? 
DEFAULT_CI_SEED, + }) + responsibilities.push({ + stepRef: stepRefOf(cell.step), + mutationKind: cell.mutation.kind, + meanEffect: ci.mean, + ci, + ciExcludesZero: ci.lower > 0 || ci.upper < 0, + reps: opts.reps, + deltas, + counterfactualRunIds: cfRunIds, + }) + } + + responsibilities.sort((a, b) => Math.abs(b.meanEffect) - Math.abs(a.meanEffect)) + + // A step probed under one mutation but cut off under another appears in + // BOTH steps and uncovered — partial coverage is named, not blended. + const uncovered = candidates.filter((s) => uncoveredIndices.has(s.index)).map(stepRefOf) + + return { + runId: opts.runId, + originalScore, + steps: responsibilities, + byMutationKind: attributeCounterfactuals(allResults), + replaysUsed, + budget: opts.budget, + uncovered, + } +} + +function resolveCandidates(trajectory: Trajectory, indices?: number[]): TrajectoryStep[] { + if (indices === undefined) { + return trajectory.steps.filter((s) => s.span.kind === 'llm' || s.span.kind === 'tool') + } + return indices.map((i) => { + const step = trajectory.steps[i] + if (!step) { + throw new ValidationError( + `causalSweep: candidateSteps index ${i} out of range [0, ${trajectory.steps.length})`, + ) + } + return step + }) +} diff --git a/src/diagnose/index.ts b/src/diagnose/index.ts new file mode 100644 index 0000000..df815e6 --- /dev/null +++ b/src/diagnose/index.ts @@ -0,0 +1,61 @@ +/** + * Diagnose chain — WHY a run failed, WHAT should have happened, HOW to + * make it happen. + * + * The full remediation pipeline this subpath closes: + * + * fuzz finds → sweep blames → repair prescribes (validated) → + * findings / corpus / invariant remediate → gates verify + * + * Three stages, all orchestration over existing primitives — nothing here + * re-implements replay, mutation, or attribution: + * + * 1. `causalSweep` — WHY. 
Runs `reps` counterfactual replays per + * (step, mutation) cell through `runCounterfactual` (the consumer's + * `CounterfactualRunner` is the execution seam) and reduces the + * per-rep deltas into a responsibility ranking with bootstrap CIs + * (`confidenceInterval`). Budget-bounded; unprobed steps are named + * in `uncovered`, never dropped. + * 2. `prescribeRepair` — WHAT SHOULD HAVE HAPPENED. Consumer-supplied + * `proposeFix` (LLM-backed in live use) proposes candidate mutations + * for the blamed steps; each candidate is machine-verified by + * replaying WITH it. Only candidates whose every validation rep + * crosses `flipThreshold` become repairs; the rest are rejected + * with a typed reason. + * 3. Remediation adapters — HOW. `toAnalystFindings` feeds the analyst + * registry, `toCorpusRecord` pins the failure as a permanent corpus + * scenario, `suggestInvariant` emits the trace-contracts hint shape. + */ + +// The execution-seam types consumers must implement live in counterfactual.ts; +// re-exported so a diagnose consumer imports from one subpath. 
+export type { + CounterfactualContext, + CounterfactualMutation, + CounterfactualResult, + CounterfactualRunner, +} from '../counterfactual' +export type { + CausalResponsibilityReport, + CausalSweepOptions, + StepRef, + StepResponsibility, +} from './causal-sweep' +export { causalSweep, stepRefOf } from './causal-sweep' +export type { InvariantHint } from './remediation' +export { + DIAGNOSE_ANALYST_ID, + describeMutation, + severityFromEffect, + suggestInvariant, + toAnalystFindings, + toCorpusRecord, +} from './remediation' +export type { + PrescribeRepairOptions, + RejectedRepair, + RepairContext, + RepairReport, + ValidatedRepair, +} from './repair' +export { prescribeRepair } from './repair' diff --git a/src/diagnose/remediation.ts b/src/diagnose/remediation.ts new file mode 100644 index 0000000..600e1f7 --- /dev/null +++ b/src/diagnose/remediation.ts @@ -0,0 +1,206 @@ +/** + * Remediation adapters — HOW DO WE MAKE IT HAPPEN? + * + * The diagnose chain ends by feeding existing improvement machinery, + * not by building new machinery: + * + * - `toAnalystFindings` → the analyst contract (`makeFinding`), so + * responsibility evidence flows into the same registry / steering / + * diff pipeline every other analyst feeds. + * - `toCorpusRecord` → the RL corpus (`CorpusRecord`), pinning the + * diagnosed failure + validated repair as a permanent scenario. + * - `suggestInvariant` → a plain-data hint in the shape the + * trace-contracts machinery consumes (`never` / `without` clauses). 
+ */ + +import type { AnalystFinding, AnalystSeverity, EvidenceRef } from '../analyst/types' +import { makeFinding } from '../analyst/types' +import type { CounterfactualMutation } from '../counterfactual' +import { ValidationError } from '../errors' +import type { CorpusRecord } from '../rl/corpus' +import type { RunRecord } from '../run-record' +import { validateRunRecord } from '../run-record' +import type { CausalResponsibilityReport, StepResponsibility } from './causal-sweep' +import type { RepairReport, ValidatedRepair } from './repair' + +export const DIAGNOSE_ANALYST_ID = 'diagnose-causal-sweep' + +/** Severity from causal effect size. Effects whose CI includes zero are + * 'info' regardless of magnitude — an indistinguishable-from-noise effect + * must not steer remediation priority. */ +export function severityFromEffect(responsibility: StepResponsibility): AnalystSeverity { + if (!responsibility.ciExcludesZero) return 'info' + const magnitude = Math.abs(responsibility.meanEffect) + if (magnitude >= 0.5) return 'critical' + if (magnitude >= 0.25) return 'high' + if (magnitude >= 0.1) return 'medium' + return 'low' +} + +/** Deterministic human-readable rendering of a mutation — used in + * recommended actions, corpus completions, and invariant hints. 
*/ +export function describeMutation(mutation: CounterfactualMutation): string { + switch (mutation.kind) { + case 'swap-model': + return `use model '${mutation.newModel}' at step ${mutation.at}` + case 'swap-tool-result': + return `replace the tool result at step ${mutation.at} with ${JSON.stringify(mutation.newResult)}` + case 'truncate-after': + return `stop the run after step ${mutation.at}` + case 'inject-system-message': + return `inject system message at step ${mutation.at}: ${mutation.content}` + case 'custom': + return `${mutation.describe} (step ${mutation.at})` + } +} + +/** + * Lift a responsibility report (and optionally its validated repairs) into + * `AnalystFinding`s via the real `makeFinding` factory. One finding per + * probed step; a validated repair for that step upgrades the finding with + * a `recommended_action` + the replay-validation evidence. + * + * Findings are OBSERVED causal probes (replay deltas), not judge verdicts, + * so `derived_from_judge` stays unset and they may steer. + */ +export function toAnalystFindings( + report: CausalResponsibilityReport, + repairs?: RepairReport, +): AnalystFinding[] { + const repairByStep = new Map<string, ValidatedRepair>() + for (const r of repairs?.repairs ?? 
[]) { + if (!repairByStep.has(r.stepRef.spanId)) repairByStep.set(r.stepRef.spanId, r) + } + + return report.steps.map((resp) => { + const repair = repairByStep.get(resp.stepRef.spanId) + const evidence: EvidenceRef[] = [ + { + kind: 'span', + uri: `span://${resp.stepRef.spanId}`, + excerpt: `step ${resp.stepRef.index} (${resp.stepRef.kind} '${resp.stepRef.name}') meanEffect=${resp.meanEffect.toFixed(4)} ci=[${resp.ci.lower.toFixed(4)}, ${resp.ci.upper.toFixed(4)}] reps=${resp.reps}`, + }, + { + kind: 'metric', + uri: `metric://diagnose/${report.runId}/step/${resp.stepRef.index}/${resp.mutationKind}`, + excerpt: `deltas=[${resp.deltas.map((d) => d.toFixed(4)).join(', ')}]`, + }, + ...resp.counterfactualRunIds.map((id): EvidenceRef => ({ kind: 'span', uri: `run://${id}` })), + ] + return makeFinding({ + analyst_id: DIAGNOSE_ANALYST_ID, + severity: severityFromEffect(resp), + area: 'causal-attribution', + claim: `step '${resp.stepRef.name}' (${resp.stepRef.kind}) is causally responsible for the run outcome under ${resp.mutationKind}`, + rationale: resp.ciExcludesZero + ? `mean effect ${resp.meanEffect.toFixed(4)} over ${resp.reps} counterfactual replays; CI [${resp.ci.lower.toFixed(4)}, ${resp.ci.upper.toFixed(4)}] excludes zero` + : `mean effect ${resp.meanEffect.toFixed(4)} over ${resp.reps} counterfactual replays; CI [${resp.ci.lower.toFixed(4)}, ${resp.ci.upper.toFixed(4)}] includes zero — not distinguishable from noise`, + evidence_refs: evidence, + recommended_action: repair ? describeMutation(repair.mutation) : undefined, + validation_plan: repair + ? `replay-validated: ${repair.reps}/${repair.reps} reps scored >= ${repairs!.flipThreshold} (mean ${repair.meanScore.toFixed(4)}, delta ${repair.deltaScore.toFixed(4)})` + : undefined, + confidence: repair ? 0.95 : resp.ciExcludesZero ? 
0.85 : 0.3, + subject: resp.stepRef.spanId, + metadata: { + stepRef: resp.stepRef, + mutationKind: resp.mutationKind, + meanEffect: resp.meanEffect, + ci: resp.ci, + deltas: resp.deltas, + counterfactualRunIds: resp.counterfactualRunIds, + ...(repair ? { repair: { mutation: repair.mutation, meanScore: repair.meanScore } } : {}), + }, + }) + }) +} + +/** + * Pin the diagnosed failure as a permanent corpus scenario. Takes the + * original run's `RunRecord` projection plus a validated repair and emits + * a fresh `CorpusRecord` (new runId, so corpus dedup keeps both the raw + * failure and the diagnosed entry). + * + * `completion` defaults to the validated mutation's rendering — "what + * should have happened" in machine-derived form. Supply `prompt` (and + * optionally a richer `completion`) when the trajectory text is available + * so the record is harvestable by `buildDatasetFromCorpus`. + */ +export function toCorpusRecord( + run: RunRecord, + repair: ValidatedRepair, + opts: { prompt?: string; completion?: string } = {}, +): CorpusRecord { + const record: CorpusRecord = { + ...run, + runId: `${run.runId}#repair:${repair.stepRef.spanId}`, + outcome: { + ...run.outcome, + raw: { + ...run.outcome.raw, + diagnose_blamed_step_index: repair.stepRef.index, + diagnose_repair_mean_score: repair.meanScore, + diagnose_repair_delta_score: repair.deltaScore, + diagnose_repair_reps: repair.reps, + }, + }, + prompt: opts.prompt, + completion: opts.completion ?? describeMutation(repair.mutation), + } + // Boundary check — a corpus record that fails RunRecord validation would + // poison every downstream harvest. + validateRunRecord(record) + return record +} + +/** Plain-data invariant hint. The trace-contracts machinery consumes this + * shape: `never` is a pattern that must not appear in a passing trace; + * `without` is a guard whose absence makes the failure reachable. 
*/ +export interface InvariantHint { + description: string + never?: string + without?: string +} + +/** + * Derive an invariant hint from a validated repair. Deterministic per + * mutation kind — the hint names the contract a trace must satisfy so + * the diagnosed failure cannot silently recur. + */ +export function suggestInvariant(repair: ValidatedRepair): InvariantHint { + const { stepRef, mutation } = repair + const at = `step ${stepRef.index} (${stepRef.kind} '${stepRef.name}')` + switch (mutation.kind) { + case 'swap-tool-result': + return { + description: `the result of tool '${stepRef.name}' was causally responsible for the failure; a replaced result flipped the outcome (delta ${repair.deltaScore.toFixed(4)})`, + never: `unvalidated result from tool '${stepRef.name}' flows downstream`, + without: `result guard on tool '${stepRef.name}'`, + } + case 'swap-model': + return { + description: `swapping the model at ${at} to '${mutation.newModel}' flipped the outcome (delta ${repair.deltaScore.toFixed(4)})`, + never: `llm span '${stepRef.name}' runs on a model other than '${mutation.newModel}'`, + } + case 'inject-system-message': + return { + description: `injecting a system message at ${at} flipped the outcome (delta ${repair.deltaScore.toFixed(4)})`, + without: `system message present at '${stepRef.name}': ${mutation.content}`, + } + case 'truncate-after': + return { + description: `stopping after ${at} flipped the outcome (delta ${repair.deltaScore.toFixed(4)}) — continuation past this step caused the failure`, + never: `spans execute after '${stepRef.name}' (index ${stepRef.index})`, + } + case 'custom': + return { + description: `${mutation.describe} at ${at} flipped the outcome (delta ${repair.deltaScore.toFixed(4)})`, + } + default: { + const exhausted: never = mutation + throw new ValidationError( + `suggestInvariant: unknown mutation kind ${JSON.stringify(exhausted)}`, + ) + } + } +} diff --git a/src/diagnose/repair.ts b/src/diagnose/repair.ts new 
file mode 100644 index 0000000..451a165 --- /dev/null +++ b/src/diagnose/repair.ts @@ -0,0 +1,200 @@ +/** + * Replay-validated repair — WHAT SHOULD HAVE HAPPENED? + * + * Takes the blamed steps from a `CausalResponsibilityReport`, asks a + * consumer-supplied `proposeFix` (LLM-backed in live use) for candidate + * mutations, and machine-verifies each candidate by replaying the run + * WITH the mutation applied (through the same `runCounterfactual` seam + * the sweep uses). + * + * A repair is "what should have happened" ONLY when every validation + * replay crosses `flipThreshold` — a prescription is never speculated, + * it is demonstrated. Candidates that don't flip, or whose replay + * errors, land in `rejected` with a typed reason; nothing is dropped + * silently. + */ + +import { + type CounterfactualMutation, + type CounterfactualRunner, + runCounterfactual, +} from '../counterfactual' +import { ValidationError } from '../errors' +import type { TraceStore } from '../trace/store' +import { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory' +import type { StepRef, StepResponsibility } from './causal-sweep' + +/** Context handed to `proposeFix` so an LLM-backed proposer can see the + * full trajectory plus the responsibility evidence for the blamed step. */ +export interface RepairContext { + runId: string + trajectory: Trajectory + originalScore: number + responsibility: StepResponsibility +} + +export interface PrescribeRepairOptions { + store: TraceStore + /** The failed run the sweep diagnosed. */ + runId: string + /** Execution seam — same `CounterfactualRunner` contract as the sweep. */ + runner: CounterfactualRunner + /** Blamed steps from `causalSweep` — typically `report.steps.slice(0, k)`. */ + blamed: StepResponsibility[] + /** Candidate-fix generator. Consumer-supplied; LLM-backed in live use. + * Returned mutations MUST target the blamed step's index. 
*/ + proposeFix: (step: TrajectoryStep, context: RepairContext) => Promise<CounterfactualMutation[]> + /** Score every validation replay must reach for the repair to count. Default 0.5. */ + flipThreshold?: number + /** Validation replays per candidate mutation. Default 3. */ + repsToValidate?: number + /** Max candidate mutations tried per step. Default: all proposed. */ + maxAttemptsPerStep?: number +} + +export interface ValidatedRepair { + stepRef: StepRef + mutation: CounterfactualMutation + /** Always true — presence in `repairs` IS the machine-verified claim. */ + validated: true + /** Mean counterfactual score across the validation reps. */ + meanScore: number + /** meanScore − originalScore. */ + deltaScore: number + reps: number + /** Replay run ids backing the validation — audit trail. */ + counterfactualRunIds: string[] +} + +export interface RejectedRepair { + stepRef: StepRef + mutation: CounterfactualMutation + reason: 'did-not-flip' | 'error' + /** Present for 'did-not-flip': mean delta over the reps that ran. */ + deltaScore?: number + /** Present for 'error': the message, preserved for diagnosis. */ + error?: string +} + +export interface RepairReport { + runId: string + originalScore: number + flipThreshold: number + repairs: ValidatedRepair[] + rejected: RejectedRepair[] + replaysUsed: number +} + +export async function prescribeRepair(opts: PrescribeRepairOptions): Promise<RepairReport> { + const flipThreshold = opts.flipThreshold ?? 0.5 + const repsToValidate = opts.repsToValidate ?? 3 + if (!Number.isInteger(repsToValidate) || repsToValidate < 1) { + throw new ValidationError( + `prescribeRepair: repsToValidate must be an integer >= 1 (got ${repsToValidate})`, + ) + } + const maxAttempts = opts.maxAttemptsPerStep ?? 
Number.POSITIVE_INFINITY + if (maxAttempts < 1) { + throw new ValidationError( + `prescribeRepair: maxAttemptsPerStep must be >= 1 (got ${opts.maxAttemptsPerStep})`, + ) + } + if (opts.blamed.length === 0) { + throw new ValidationError('prescribeRepair: blamed is empty — nothing to repair') + } + + const originalRun = await opts.store.getRun(opts.runId) + if (!originalRun) throw new ValidationError(`prescribeRepair: run ${opts.runId} not found`) + const originalScore = originalRun.outcome?.score + if (typeof originalScore !== 'number' || !Number.isFinite(originalScore)) { + throw new ValidationError( + `prescribeRepair: run ${opts.runId} has no numeric outcome.score — flips have no baseline`, + ) + } + + const trajectory = await buildTrajectory(opts.store, opts.runId) + + const repairs: ValidatedRepair[] = [] + const rejected: RejectedRepair[] = [] + let replaysUsed = 0 + + for (const responsibility of opts.blamed) { + const step = trajectory.steps[responsibility.stepRef.index] + if (!step || step.span.spanId !== responsibility.stepRef.spanId) { + throw new ValidationError( + `prescribeRepair: blamed step index=${responsibility.stepRef.index} spanId=${responsibility.stepRef.spanId} does not match run ${opts.runId} — stale report?`, + ) + } + + const candidates = await opts.proposeFix(step, { + runId: opts.runId, + trajectory, + originalScore, + responsibility, + }) + const toTry = candidates.slice(0, maxAttempts) + + for (const mutation of toTry) { + if (mutation.at !== step.index) { + throw new ValidationError( + `prescribeRepair: proposeFix returned a mutation targeting at=${mutation.at} for blamed step index=${step.index}`, + ) + } + const scores: number[] = [] + const cfRunIds: string[] = [] + let failure: string | undefined + for (let rep = 0; rep < repsToValidate; rep++) { + try { + const result = await runCounterfactual(opts.store, opts.runId, mutation, opts.runner) + replaysUsed++ + const score = result.delta.counterfactualOutcomeScore + if (typeof score 
!== 'number' || !Number.isFinite(score)) { + failure = `validation rep ${rep} produced no numeric score — the runner must endRun with a numeric outcome.score` + break + } + scores.push(score) + cfRunIds.push(result.counterfactualRunId) + } catch (err) { + replaysUsed++ + failure = err instanceof Error ? err.message : String(err) + break + } + } + + if (failure !== undefined) { + rejected.push({ + stepRef: responsibility.stepRef, + mutation, + reason: 'error', + error: failure, + }) + continue + } + + const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length + const everyRepFlipped = scores.every((s) => s >= flipThreshold) + if (everyRepFlipped) { + repairs.push({ + stepRef: responsibility.stepRef, + mutation, + validated: true, + meanScore, + deltaScore: meanScore - originalScore, + reps: repsToValidate, + counterfactualRunIds: cfRunIds, + }) + // First validated repair per step IS the prescription; remaining + // candidates are untried, not rejected — we don't fabricate verdicts. 
+ break + } + rejected.push({ + stepRef: responsibility.stepRef, + mutation, + reason: 'did-not-flip', + deltaScore: meanScore - originalScore, + }) + } + } + + return { runId: opts.runId, originalScore, flipThreshold, repairs, rejected, replaysUsed } +} diff --git a/tests/diagnose.test.ts b/tests/diagnose.test.ts new file mode 100644 index 0000000..14bf9c6 --- /dev/null +++ b/tests/diagnose.test.ts @@ -0,0 +1,490 @@ +import { describe, expect, it } from 'vitest' +import type { CounterfactualMutation, CounterfactualRunner } from '../src/counterfactual' +import { + causalSweep, + DIAGNOSE_ANALYST_ID, + describeMutation, + prescribeRepair, + suggestInvariant, + toAnalystFindings, + toCorpusRecord, + type ValidatedRepair, +} from '../src/diagnose' +import type { RunRecord } from '../src/run-record' +import type { ToolSpan } from '../src/trace' +import { InMemoryTraceStore, TraceEmitter } from '../src/trace' + +function mulberry32(seed: number): () => number { + let s = seed | 0 + return () => { + s = (s + 0x6d2b79f5) | 0 + let t = s + t = Math.imul(t ^ (t >>> 15), t | 1) + t ^= t + Math.imul(t ^ (t >>> 7), t | 61) + return ((t ^ (t >>> 14)) >>> 0) / 4294967296 + } +} + +async function seedRun( + store: InMemoryTraceStore, + outputScore: number, + shape: Array<{ kind: 'llm' | 'tool'; name: string; model?: string; toolName?: string }>, +): Promise<string> { + const e = new TraceEmitter(store) + await e.startRun({ scenarioId: 's' }) + for (const s of shape) { + if (s.kind === 'llm') { + const h = await e.span({ + kind: 'llm', + name: s.name, + model: s.model ?? 'm', + messages: [], + output: 'x', + }) + await h.end() + } else { + const h = await e.span({ + kind: 'tool', + name: s.name, + toolName: s.toolName ?? 
s.name, + args: {}, + }) + await h.end({ result: 'rate=WRONG' } as Partial<ToolSpan>) + } + } + await e.endRun({ pass: false, score: outputScore }) + return e.runId +} + +const SHAPE = [ + { kind: 'llm' as const, name: 'plan' }, + { kind: 'tool' as const, name: 'fetch-rates' }, + { kind: 'tool' as const, name: 'format' }, + { kind: 'llm' as const, name: 'answer' }, +] + +/** + * Deterministic fake of the execution seam (same pattern as the + * runCounterfactual tests in tier2.test.ts): knocking out the faulty + * fetch-rates step (index 1) flips the run to ~0.8; every other + * intervention reproduces the original ~0.2 plus seeded noise. + */ +function makeRunner(opts: { seed: number; scoreFor?: (m: CounterfactualMutation) => number }): { + runner: CounterfactualRunner + calls: CounterfactualMutation[] +} { + const rng = mulberry32(opts.seed) + const calls: CounterfactualMutation[] = [] + const runner: CounterfactualRunner = { + async executeFrom(ctx, emitter) { + calls.push(ctx.mutation) + // Symmetric two-draw noise so per-rep deltas straddle the mean. + const noise = (rng() - 0.5) * 0.02 + (rng() - 0.5) * 0.02 + const base = + opts.scoreFor?.(ctx.mutation) ?? + (ctx.mutation.kind === 'swap-tool-result' && ctx.mutation.at === 1 ? 
0.8 : 0.2) + await emitter.endRun({ pass: base >= 0.5, score: base + noise }) + }, + } + return { runner, calls } +} + +describe('causalSweep', () => { + it('ranks the injected-fault step #1 with CI excluding zero; no-effect step CI includes zero', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner } = makeRunner({ seed: 42 }) + + const report = await causalSweep({ + store, + runId, + runner, + candidateSteps: [1, 2], + reps: 5, + budget: 100, + ciSeed: 7, + }) + + expect(report.steps).toHaveLength(2) + const [top, rest] = report.steps + expect(top!.stepRef.index).toBe(1) + expect(top!.stepRef.name).toBe('fetch-rates') + expect(top!.mutationKind).toBe('swap-tool-result') + expect(top!.meanEffect).toBeGreaterThan(0.5) + expect(top!.ciExcludesZero).toBe(true) + expect(top!.ci.lower).toBeGreaterThan(0) + expect(top!.deltas).toHaveLength(5) + + expect(rest!.stepRef.index).toBe(2) + expect(rest!.ciExcludesZero).toBe(false) + expect(rest!.ci.lower).toBeLessThanOrEqual(0) + expect(rest!.ci.upper).toBeGreaterThanOrEqual(0) + + expect(report.replaysUsed).toBe(10) + expect(report.uncovered).toHaveLength(0) + expect(report.originalScore).toBeCloseTo(0.2) + expect(report.byMutationKind[0]!.mutationKind).toBe('swap-tool-result') + expect(report.byMutationKind[0]!.n).toBe(10) + }) + + it('records counterfactual replays as meta runs parented to the original', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner } = makeRunner({ seed: 1 }) + const report = await causalSweep({ + store, + runId, + runner, + candidateSteps: [1], + reps: 2, + budget: 10, + }) + const cfRun = await store.getRun(report.steps[0]!.counterfactualRunIds[0]!) 
+ expect(cfRun?.parentRunId).toBe(runId) + expect(cfRun?.layer).toBe('meta') + }) + + it('names uncovered steps under a tight budget instead of silently dropping them', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner } = makeRunner({ seed: 9 }) + + const report = await causalSweep({ + store, + runId, + runner, + candidateSteps: [1, 2], + reps: 4, + budget: 6, + }) + + expect(report.steps).toHaveLength(1) + expect(report.steps[0]!.stepRef.index).toBe(1) + expect(report.replaysUsed).toBe(4) + expect(report.uncovered).toHaveLength(1) + expect(report.uncovered[0]!.index).toBe(2) + expect(report.uncovered[0]!.name).toBe('format') + }) + + it('covers nothing when budget < reps — everything uncovered, zero replays', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner, calls } = makeRunner({ seed: 9 }) + const report = await causalSweep({ + store, + runId, + runner, + candidateSteps: [1, 2], + reps: 4, + budget: 3, + }) + expect(report.steps).toHaveLength(0) + expect(report.replaysUsed).toBe(0) + expect(calls).toHaveLength(0) + expect(report.uncovered.map((s) => s.index)).toEqual([1, 2]) + }) + + it('defaults candidate steps to llm + tool spans with the payload-free probe kinds', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner, calls } = makeRunner({ seed: 3 }) + await causalSweep({ store, runId, runner, reps: 2, budget: 100 }) + // 4 steps × 1 default mutation × 2 reps + expect(calls).toHaveLength(8) + const kinds = new Set(calls.map((c) => c.kind)) + expect(kinds).toEqual(new Set(['truncate-after', 'swap-tool-result'])) + }) + + it('rejects reps < 2 — a single intervention delta is noise, not measurement', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner } = makeRunner({ seed: 3 }) + await 
expect(causalSweep({ store, runId, runner, reps: 1, budget: 10 })).rejects.toThrow( + /reps must be an integer >= 2/, + ) + }) + + it('fails loud when the original run has no numeric score', async () => { + const store = new InMemoryTraceStore() + const e = new TraceEmitter(store) + await e.startRun({ scenarioId: 's' }) + await e.endRun({ pass: false }) + const { runner } = makeRunner({ seed: 3 }) + await expect( + causalSweep({ store, runId: e.runId, runner, reps: 2, budget: 10 }), + ).rejects.toThrow(/no numeric outcome\.score/) + }) + + it('fails loud when a replay omits the score instead of recording a bogus delta', async () => { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const runner: CounterfactualRunner = { + async executeFrom(_ctx, emitter) { + await emitter.endRun({ pass: true }) + }, + } + await expect( + causalSweep({ store, runId, runner, candidateSteps: [1], reps: 2, budget: 10 }), + ).rejects.toThrow(/runner must endRun with a numeric outcome\.score/) + }) +}) + +describe('prescribeRepair', () => { + async function diagnosedSetup() { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const { runner } = makeRunner({ seed: 42 }) + const report = await causalSweep({ + store, + runId, + runner, + candidateSteps: [1, 2], + reps: 5, + budget: 100, + ciSeed: 7, + }) + return { store, runId, report } + } + + const goodFix: CounterfactualMutation = { + kind: 'swap-tool-result', + at: 1, + newResult: { rate: 4.5 }, + } + const badFix: CounterfactualMutation = { kind: 'swap-tool-result', at: 1, newResult: 'garbage' } + + it('emits only flipping mutations; non-flippers land in rejected with reason', async () => { + const { store, runId, report } = await diagnosedSetup() + const { runner } = makeRunner({ + seed: 11, + scoreFor: (m) => + m.kind === 'swap-tool-result' && + JSON.stringify(m.newResult) === JSON.stringify(goodFix.newResult) + ? 
0.9 + : 0.3, + }) + + const repair = await prescribeRepair({ + store, + runId, + runner, + blamed: report.steps.slice(0, 1), + proposeFix: async () => [badFix, goodFix], + flipThreshold: 0.5, + repsToValidate: 3, + }) + + expect(repair.repairs).toHaveLength(1) + const validated = repair.repairs[0]! + expect(validated.validated).toBe(true) + expect(validated.mutation).toEqual(goodFix) + expect(validated.stepRef.index).toBe(1) + expect(validated.meanScore).toBeGreaterThanOrEqual(0.5) + expect(validated.deltaScore).toBeCloseTo(validated.meanScore - 0.2, 10) + expect(validated.reps).toBe(3) + expect(validated.counterfactualRunIds).toHaveLength(3) + + expect(repair.rejected).toHaveLength(1) + expect(repair.rejected[0]!.reason).toBe('did-not-flip') + expect(repair.rejected[0]!.mutation).toEqual(badFix) + expect(repair.rejected[0]!.deltaScore).toBeCloseTo(0.1, 1) + expect(repair.replaysUsed).toBe(6) + }) + + it('a repair must flip on EVERY validation rep, not on average', async () => { + const { store, runId, report } = await diagnosedSetup() + // Scores alternate 0.9 / 0.4: mean 0.65 crosses the threshold but rep 2 does not. + let call = 0 + const runner: CounterfactualRunner = { + async executeFrom(_ctx, emitter) { + call++ + await emitter.endRun({ pass: true, score: call % 2 === 1 ? 
0.9 : 0.4 }) + }, + } + const repair = await prescribeRepair({ + store, + runId, + runner, + blamed: report.steps.slice(0, 1), + proposeFix: async () => [goodFix], + repsToValidate: 3, + }) + expect(repair.repairs).toHaveLength(0) + expect(repair.rejected[0]!.reason).toBe('did-not-flip') + }) + + it('replay errors become typed rejections, never silent drops', async () => { + const { store, runId, report } = await diagnosedSetup() + const { runner } = makeRunner({ seed: 5 }) + const explosive: CounterfactualMutation = { + kind: 'custom', + at: 1, + describe: 'patch the parser', + apply: () => { + throw new Error('boom: parser patch unapplicable') + }, + } + const repair = await prescribeRepair({ + store, + runId, + runner, + blamed: report.steps.slice(0, 1), + proposeFix: async () => [explosive], + }) + expect(repair.repairs).toHaveLength(0) + expect(repair.rejected).toHaveLength(1) + expect(repair.rejected[0]!.reason).toBe('error') + expect(repair.rejected[0]!.error).toMatch(/boom/) + }) + + it('respects maxAttemptsPerStep', async () => { + const { store, runId, report } = await diagnosedSetup() + const { runner } = makeRunner({ seed: 5, scoreFor: () => 0.3 }) + const repair = await prescribeRepair({ + store, + runId, + runner, + blamed: report.steps.slice(0, 1), + proposeFix: async () => [badFix, goodFix], + maxAttemptsPerStep: 1, + }) + expect(repair.repairs).toHaveLength(0) + expect(repair.rejected).toHaveLength(1) + expect(repair.rejected[0]!.mutation).toEqual(badFix) + }) + + it('rejects a stale report whose stepRef does not match the run', async () => { + const { store, runId, report } = await diagnosedSetup() + const { runner } = makeRunner({ seed: 5 }) + const stale = { ...report.steps[0]!, stepRef: { ...report.steps[0]!.stepRef, spanId: 'nope' } } + await expect( + prescribeRepair({ + store, + runId, + runner, + blamed: [stale], + proposeFix: async () => [goodFix], + }), + ).rejects.toThrow(/does not match run/) + }) +}) + +describe('remediation adapters', 
() => { + async function fullChain() { + const store = new InMemoryTraceStore() + const runId = await seedRun(store, 0.2, SHAPE) + const sweep = makeRunner({ seed: 42 }) + const report = await causalSweep({ + store, + runId, + runner: sweep.runner, + candidateSteps: [1, 2], + reps: 5, + budget: 100, + ciSeed: 7, + }) + const fix: CounterfactualMutation = { + kind: 'swap-tool-result', + at: 1, + newResult: { rate: 4.5 }, + } + const validate = makeRunner({ seed: 11, scoreFor: () => 0.9 }) + const repairs = await prescribeRepair({ + store, + runId, + runner: validate.runner, + blamed: report.steps.slice(0, 1), + proposeFix: async () => [fix], + }) + return { report, repairs } + } + + it('toAnalystFindings emits schema-valid findings with effect-scaled severity', async () => { + const { report, repairs } = await fullChain() + const findings = toAnalystFindings(report, repairs) + expect(findings).toHaveLength(2) + + for (const f of findings) { + expect(f.schema_version).toBe('1.0.0') + expect(f.finding_id).toMatch(/^f_[0-9a-f]{20}$/) + expect(f.analyst_id).toBe(DIAGNOSE_ANALYST_ID) + expect(f.area).toBe('causal-attribution') + expect(f.evidence_refs.length).toBeGreaterThanOrEqual(2) + expect(f.derived_from_judge).toBeUndefined() + } + + const blamed = findings.find((f) => f.subject === report.steps[0]!.stepRef.spanId)! + expect(blamed.severity).toBe('critical') + expect(blamed.confidence).toBe(0.95) + expect(blamed.recommended_action).toBe(describeMutation(repairs.repairs[0]!.mutation)) + expect(blamed.validation_plan).toMatch(/replay-validated: 3\/3 reps scored >= 0\.5/) + expect(blamed.evidence_refs[0]!.uri).toBe(`span://${report.steps[0]!.stepRef.spanId}`) + expect(blamed.evidence_refs[1]!.excerpt).toContain('deltas=[') + + const noise = findings.find((f) => f.subject === report.steps[1]!.stepRef.spanId)! 
+ expect(noise.severity).toBe('info') + expect(noise.confidence).toBe(0.3) + expect(noise.recommended_action).toBeUndefined() + }) + + it('toCorpusRecord pins the failure as a fresh, schema-valid corpus scenario', async () => { + const { repairs } = await fullChain() + const original: RunRecord = { + runId: 'run-original', + experimentId: 'exp-1', + candidateId: 'cand-1', + seed: 42, + model: 'test-model@2026-01-01', + promptHash: 'p'.repeat(8), + configHash: 'c'.repeat(8), + commitSha: 'deadbeef', + wallMs: 1200, + costUsd: 0.01, + tokenUsage: { input: 100, output: 50 }, + outcome: { searchScore: 0.2, raw: {} }, + splitTag: 'search', + } + const repair = repairs.repairs[0]! + const pinned = toCorpusRecord(original, repair, { prompt: 'fetch the current rates' }) + + expect(pinned.runId).toBe(`run-original#repair:${repair.stepRef.spanId}`) + expect(pinned.runId).not.toBe(original.runId) + expect(pinned.prompt).toBe('fetch the current rates') + expect(pinned.completion).toBe(describeMutation(repair.mutation)) + expect(pinned.outcome.raw.diagnose_blamed_step_index).toBe(1) + expect(pinned.outcome.raw.diagnose_repair_mean_score).toBeCloseTo(repair.meanScore, 10) + expect(pinned.outcome.raw.diagnose_repair_delta_score).toBeCloseTo(repair.deltaScore, 10) + // Original record untouched. + expect(original.outcome.raw.diagnose_blamed_step_index).toBeUndefined() + }) + + it('suggestInvariant derives never/without clauses per mutation kind', async () => { + const { repairs } = await fullChain() + const toolHint = suggestInvariant(repairs.repairs[0]!) + expect(toolHint.description).toContain('fetch-rates') + expect(toolHint.never).toContain("tool 'fetch-rates'") + expect(toolHint.without).toContain("tool 'fetch-rates'") + + const base = repairs.repairs[0]! 
+ const truncate: ValidatedRepair = { + ...base, + mutation: { kind: 'truncate-after', at: 1 }, + } + const truncateHint = suggestInvariant(truncate) + expect(truncateHint.never).toContain('after') + expect(truncateHint.without).toBeUndefined() + + const inject: ValidatedRepair = { + ...base, + mutation: { kind: 'inject-system-message', at: 1, content: 'always validate rates' }, + } + const injectHint = suggestInvariant(inject) + expect(injectHint.without).toContain('always validate rates') + + const swapModel: ValidatedRepair = { + ...base, + mutation: { kind: 'swap-model', at: 1, newModel: 'better-model@2026-01-01' }, + } + expect(suggestInvariant(swapModel).never).toContain('better-model@2026-01-01') + }) +}) diff --git a/tsup.config.ts b/tsup.config.ts index 2e69b76..55aac36 100644 --- a/tsup.config.ts +++ b/tsup.config.ts @@ -7,6 +7,7 @@ export default defineConfig({ control: 'src/control.ts', reporting: 'src/reporting.ts', rl: 'src/rl/index.ts', + diagnose: 'src/diagnose/index.ts', traces: 'src/traces.ts', 'telemetry/index': 'src/telemetry/index.ts', 'telemetry/file': 'src/telemetry/sink-file.ts',