From a17dca38d0b76fd4cf0df92869e959d831add3ee Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 10 Jun 2026 04:42:52 -0600
Subject: [PATCH] feat(diagnose): causal sweep, responsibility scoring,
 replay-validated repair
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New ./diagnose subpath orchestrating the dormant counterfactual
primitives into a three-stage remediation chain:

- causalSweep — reps x steps x mutations within a hard replay budget,
  composed over runCounterfactual; per-step mean effect + bootstrap CI
  (confidenceInterval) ranked by |meanEffect|; kind-level aggregate via
  attributeCounterfactuals; budget exhaustion names uncovered steps.
- prescribeRepair — consumer-supplied proposeFix candidates are
  machine-verified by replaying WITH the mutation; a repair counts only
  when every validation rep crosses flipThreshold; non-flippers and
  replay errors land in rejected with typed reasons.
- Remediation adapters into existing machinery: toAnalystFindings
  (makeFinding, severity from effect size, CI-gated), toCorpusRecord
  (pins the failure as a permanent corpus scenario, validateRunRecord
  at the boundary), suggestInvariant (never/without hint shape for
  trace contracts).

Deterministic tests fake the CounterfactualRunner seam with seeded
mulberry32 noise; no LLM calls.
---
 package.json                 |   5 +
 src/diagnose/causal-sweep.ts | 243 +++++++++++++++++
 src/diagnose/index.ts        |  61 +++++
 src/diagnose/remediation.ts  | 206 +++++++++++++++
 src/diagnose/repair.ts       | 200 ++++++++++++++
 tests/diagnose.test.ts       | 490 +++++++++++++++++++++++++++++++++++
 tsup.config.ts               |   1 +
 7 files changed, 1206 insertions(+)
 create mode 100644 src/diagnose/causal-sweep.ts
 create mode 100644 src/diagnose/index.ts
 create mode 100644 src/diagnose/remediation.ts
 create mode 100644 src/diagnose/repair.ts
 create mode 100644 tests/diagnose.test.ts

diff --git a/package.json b/package.json
index 9d531b1..f816004 100644
--- a/package.json
+++ b/package.json
@@ -39,6 +39,11 @@
       "import": "./dist/rl.js",
       "default": "./dist/rl.js"
     },
+    "./diagnose": {
+      "types": "./dist/diagnose.d.ts",
+      "import": "./dist/diagnose.js",
+      "default": "./dist/diagnose.js"
+    },
     "./traces": {
       "types": "./dist/traces.d.ts",
       "import": "./dist/traces.js",
diff --git a/src/diagnose/causal-sweep.ts b/src/diagnose/causal-sweep.ts
new file mode 100644
index 0000000..5e19389
--- /dev/null
+++ b/src/diagnose/causal-sweep.ts
@@ -0,0 +1,243 @@
+/**
+ * Causal sweep — WHY did this run fail?
+ *
+ * Orchestrates the dormant counterfactual primitives into a responsibility
+ * report: for each candidate step, run `reps` counterfactual replays per
+ * mutation (via `runCounterfactual` — the consumer's `CounterfactualRunner`
+ * is the execution seam) and reduce the per-rep score deltas into a mean
+ * effect + bootstrap confidence interval (via `confidenceInterval`).
+ *
+ * Why `reps` is REQUIRED: a single intervention delta is one stochastic
+ * draw — LLM re-execution from a prefix is sampled, so one replay cannot
+ * distinguish "this step caused the failure" from sampling noise. The
+ * signal is the distribution of deltas across reps; the CI over that
+ * distribution is what lets a caller say "this step's effect excludes
+ * zero" instead of eyeballing a point estimate.
+ *
+ * Budget discipline: the sweep never silently drops cells. When the
+ * remaining budget cannot fund a full `reps`-sized cell, the sweep halts
+ * and every step not fully probed is named in `uncovered`.
+ */
+
+import {
+  attributeCounterfactuals,
+  type CounterfactualMutation,
+  type CounterfactualResult,
+  type CounterfactualRunner,
+  runCounterfactual,
+} from '../counterfactual'
+import { ValidationError } from '../errors'
+import { confidenceInterval } from '../statistics'
+import type { Span } from '../trace/schema'
+import type { TraceStore } from '../trace/store'
+import { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'
+
+/** Stable reference to a trajectory step — carried through reports,
+ *  findings, and corpus records so evidence stays addressable. */
+export interface StepRef {
+  index: number // copied from TrajectoryStep.index; used to look the step up in trajectory.steps
+  spanId: string // spanId of the span the step wraps
+  kind: Span['kind'] // kind of that span (e.g. 'llm', 'tool')
+  name: string // name of that span
+}
+
+/** Project a trajectory step down to its `StepRef`: the step's own index
+ *  plus the identifying fields (spanId, kind, name) of the span it wraps. */
+export function stepRefOf(step: TrajectoryStep): StepRef {
+  const { index, span } = step
+  const { spanId, kind, name } = span
+  const ref: StepRef = { index, spanId, kind, name }
+  return ref
+}
+
+export interface CausalSweepOptions {
+  store: TraceStore // read for the run and its trajectory; also passed to every runCounterfactual call
+  /** The failed run to diagnose. Its `outcome.score` is the baseline every
+   *  counterfactual delta is measured against. */
+  runId: string
+  /** Execution seam — identical contract to `runCounterfactual`: re-runs the
+   *  agent from the mutation point and MUST `endRun` with a numeric score. */
+  runner: CounterfactualRunner
+  /** Trajectory indices to probe; an index with no step behind it throws.
+   *  Default: every llm + tool span — the kinds the default probes target. */
+  candidateSteps?: number[]
+  /**
+   * Mutations to probe a given step with. Returned mutations MUST target
+   * `step.index` (anything else throws). Default probes are payload-free:
+   *   - tool span → `swap-tool-result` with `newResult: null` (knockout:
+   *     how much did the run depend on this tool's information?)
+   *   - llm span → `truncate-after` (re-roll: how much did the realized
+   *     turn deviate from the policy's typical continuation?)
+   * `swap-model` / `inject-system-message` need consumer payloads, so they
+   * are opt-in via this callback.
+   */
+  mutationsPerStep?: (step: TrajectoryStep) => CounterfactualMutation[]
+  /** Replays per (step, mutation) cell. Integer >= 2 — see module doc. */
+  reps: number
+  /** Hard cap on total replays (integer >= 1); a cell only starts if all `reps` still fit. */
+  budget: number
+  /** Seed for the bootstrap CI resampler. Falls back to a fixed constant so
+   *  two sweeps over the same deltas report identical intervals. */
+  ciSeed?: number
+  /** Bootstrap CI confidence level. Default 0.95. */
+  ciConfidence?: number
+}
+
+export interface StepResponsibility {
+  stepRef: StepRef // the probed step
+  mutationKind: CounterfactualMutation['kind'] // kind of the mutation this row was measured under
+  /** Mean of per-rep score deltas (counterfactual − original); equals `ci.mean`. */
+  meanEffect: number
+  /** Bootstrap CI over the per-rep deltas, as returned by `confidenceInterval`. */
+  ci: { mean: number; lower: number; upper: number }
+  /** `ci.lower > 0 || ci.upper < 0` — the effect is distinguishable from noise. */
+  ciExcludesZero: boolean
+  reps: number // replays behind this row — the sweep's `reps` option
+  /** Raw per-rep deltas — downstream evidence, never re-derived. */
+  deltas: number[]
+  /** Replay run ids (layer='meta', parentRunId=original) for audit. */
+  counterfactualRunIds: string[]
+}
+
+export interface CausalResponsibilityReport {
+  runId: string // the diagnosed run
+  originalScore: number // that run's outcome.score — the baseline for every delta
+  /** One row per probed (step, mutation) cell, ranked by |meanEffect| descending. */
+  steps: StepResponsibility[]
+  /** Kind-level aggregate from the existing `attributeCounterfactuals`. */
+  byMutationKind: ReturnType<typeof attributeCounterfactuals>
+  replaysUsed: number // counterfactual replays actually executed
+  budget: number // the `budget` option, echoed back
+  /** Steps planned but not fully probed before the budget ran out.
+   *  Named, never silent: an absent step is "no effect found"; an
+   *  uncovered step is "not measured". */
+  uncovered: StepRef[]
+}
+
+const DEFAULT_CI_SEED = 0x5eed // bootstrap seed used when opts.ciSeed is omitted
+
+function defaultMutations(step: TrajectoryStep): CounterfactualMutation[] {
+  switch (step.span.kind) {
+    case 'tool': // knockout: null out this tool's result
+      return [{ kind: 'swap-tool-result', at: step.index, newResult: null }]
+    case 'llm': // re-roll from this turn
+      return [{ kind: 'truncate-after', at: step.index }]
+    default: return [] // no payload-free probe for any other span kind
+  }
+}
+/** Rank candidate steps by counterfactual effect: `reps` replays per (step, mutation) cell, capped by `budget`. */
+export async function causalSweep(opts: CausalSweepOptions): Promise<CausalResponsibilityReport> {
+  if (!Number.isInteger(opts.reps) || opts.reps < 2) {
+    throw new ValidationError(
+      `causalSweep: reps must be an integer >= 2 (got ${opts.reps}) — a single-intervention delta is one stochastic draw, not a measurement`,
+    )
+  }
+  if (!Number.isInteger(opts.budget) || opts.budget < 1) {
+    throw new ValidationError(`causalSweep: budget must be an integer >= 1 (got ${opts.budget})`)
+  }
+  // Baseline: the diagnosed run must exist and carry a finite outcome.score.
+  const originalRun = await opts.store.getRun(opts.runId)
+  if (!originalRun) throw new ValidationError(`causalSweep: run ${opts.runId} not found`)
+  const originalScore = originalRun.outcome?.score
+  if (typeof originalScore !== 'number' || !Number.isFinite(originalScore)) {
+    throw new ValidationError(
+      `causalSweep: run ${opts.runId} has no numeric outcome.score — deltas have no baseline`,
+    )
+  }
+  // Plan: one cell per (candidate step, mutation) pair, in candidate order.
+  const trajectory = await buildTrajectory(opts.store, opts.runId)
+  const candidates = resolveCandidates(trajectory, opts.candidateSteps)
+  const mutationsFor = opts.mutationsPerStep ?? defaultMutations
+
+  interface Cell {
+    step: TrajectoryStep
+    mutation: CounterfactualMutation
+  }
+  const cells: Cell[] = []
+  for (const step of candidates) {
+    const mutations = mutationsFor(step)
+    for (const m of mutations) {
+      if (m.at !== step.index) {
+        throw new ValidationError(
+          `causalSweep: mutationsPerStep returned a mutation targeting at=${m.at} for step index=${step.index} — mutations must target the step they were asked for`,
+        )
+      }
+      cells.push({ step, mutation: m })
+    }
+  }
+  // Execute: cells run one at a time; a cell is either funded for all opts.reps replays or skipped.
+  const responsibilities: StepResponsibility[] = []
+  const allResults: CounterfactualResult[] = []
+  const uncoveredIndices = new Set<number>()
+  let replaysUsed = 0
+  let halted = false
+
+  for (const cell of cells) {
+    if (halted || replaysUsed + opts.reps > opts.budget) {
+      // A partial cell would report a CI over fewer reps than requested —
+      // weaker evidence masquerading as the real thing. Halt and name it.
+      halted = true
+      uncoveredIndices.add(cell.step.index)
+      continue // keep iterating so every remaining cell's step gets named too
+    }
+    const deltas: number[] = []
+    const cfRunIds: string[] = []
+    for (let rep = 0; rep < opts.reps; rep++) {
+      const result = await runCounterfactual(opts.store, opts.runId, cell.mutation, opts.runner)
+      replaysUsed++
+      const d = result.delta.deltaScore
+      if (typeof d !== 'number' || !Number.isFinite(d)) {
+        throw new ValidationError(
+          `causalSweep: counterfactual replay for step ${cell.step.index} (${cell.mutation.kind}) rep ${rep} produced no numeric score — the runner must endRun with a numeric outcome.score`,
+        )
+      }
+      deltas.push(d)
+      cfRunIds.push(result.counterfactualRunId)
+      allResults.push(result)
+    }
+    const ci = confidenceInterval(deltas, opts.ciConfidence ?? 0.95, {
+      seed: opts.ciSeed ?? DEFAULT_CI_SEED,
+    })
+    responsibilities.push({
+      stepRef: stepRefOf(cell.step),
+      mutationKind: cell.mutation.kind,
+      meanEffect: ci.mean,
+      ci,
+      ciExcludesZero: ci.lower > 0 || ci.upper < 0,
+      reps: opts.reps,
+      deltas,
+      counterfactualRunIds: cfRunIds,
+    })
+  }
+
+  responsibilities.sort((a, b) => Math.abs(b.meanEffect) - Math.abs(a.meanEffect))
+
+  // A step probed under one mutation but cut off under another appears in
+  // BOTH steps and uncovered — partial coverage is named, not blended.
+  const uncovered = candidates.filter((s) => uncoveredIndices.has(s.index)).map(stepRefOf)
+
+  return {
+    runId: opts.runId,
+    originalScore,
+    steps: responsibilities,
+    byMutationKind: attributeCounterfactuals(allResults), // fed every replay result from every probed cell
+    replaysUsed,
+    budget: opts.budget,
+    uncovered,
+  }
+}
+
+function resolveCandidates(trajectory: Trajectory, indices?: number[]): TrajectoryStep[] {
+  const { steps } = trajectory
+  // Explicit selection: every requested index must resolve to a real step.
+  // Negative, fractional and past-the-end indices all read back as undefined.
+  const lookup = (i: number): TrajectoryStep => {
+    const found = steps[i]
+    if (found) return found
+    throw new ValidationError(
+      `causalSweep: candidateSteps index ${i} out of range [0, ${steps.length})`,
+    )
+  }
+  if (indices !== undefined) return indices.map(lookup)
+  return steps.filter(({ span }) => span.kind === 'llm' || span.kind === 'tool')
+}
diff --git a/src/diagnose/index.ts b/src/diagnose/index.ts
new file mode 100644
index 0000000..df815e6
--- /dev/null
+++ b/src/diagnose/index.ts
@@ -0,0 +1,61 @@
+/**
+ * Diagnose chain — WHY a run failed, WHAT should have happened, HOW to
+ * make it happen.
+ *
+ * The full remediation pipeline this subpath closes:
+ *
+ *   fuzz finds → sweep blames → repair prescribes (validated) →
+ *   findings / corpus / invariant remediate → gates verify
+ *
+ * Three stages, all orchestration over existing primitives — nothing here
+ * re-implements replay, mutation, or attribution:
+ *
+ *   1. `causalSweep` — WHY. Runs `reps` counterfactual replays per
+ *      (step, mutation) cell through `runCounterfactual` (the consumer's
+ *      `CounterfactualRunner` is the execution seam) and reduces the
+ *      per-rep deltas into a responsibility ranking with bootstrap CIs
+ *      (`confidenceInterval`). Budget-bounded; unprobed steps are named
+ *      in `uncovered`, never dropped.
+ *   2. `prescribeRepair` — WHAT SHOULD HAVE HAPPENED. Consumer-supplied
+ *      `proposeFix` (LLM-backed in live use) proposes candidate mutations
+ *      for the blamed steps; each candidate is machine-verified by
+ *      replaying WITH it. Only candidates whose every validation rep
+ *      crosses `flipThreshold` become repairs; the rest are rejected
+ *      with a typed reason.
+ *   3. Remediation adapters — HOW. `toAnalystFindings` feeds the analyst
+ *      registry, `toCorpusRecord` pins the failure as a permanent corpus
+ *      scenario, `suggestInvariant` emits the trace-contracts hint shape.
+ */
+
+// The execution-seam types consumers must implement live in counterfactual.ts;
+// re-exported so a diagnose consumer imports from one subpath.
+export type {
+  CounterfactualContext,
+  CounterfactualMutation,
+  CounterfactualResult,
+  CounterfactualRunner,
+} from '../counterfactual'
+export type {
+  CausalResponsibilityReport,
+  CausalSweepOptions,
+  StepRef,
+  StepResponsibility,
+} from './causal-sweep'
+export { causalSweep, stepRefOf } from './causal-sweep'
+export type { InvariantHint } from './remediation'
+export {
+  DIAGNOSE_ANALYST_ID,
+  describeMutation,
+  severityFromEffect,
+  suggestInvariant,
+  toAnalystFindings,
+  toCorpusRecord,
+} from './remediation'
+export type {
+  PrescribeRepairOptions,
+  RejectedRepair,
+  RepairContext,
+  RepairReport,
+  ValidatedRepair,
+} from './repair'
+export { prescribeRepair } from './repair'
diff --git a/src/diagnose/remediation.ts b/src/diagnose/remediation.ts
new file mode 100644
index 0000000..600e1f7
--- /dev/null
+++ b/src/diagnose/remediation.ts
@@ -0,0 +1,206 @@
+/**
+ * Remediation adapters — HOW DO WE MAKE IT HAPPEN?
+ *
+ * The diagnose chain ends by feeding existing improvement machinery,
+ * not by building new machinery:
+ *
+ *   - `toAnalystFindings` → the analyst contract (`makeFinding`), so
+ *     responsibility evidence flows into the same registry / steering /
+ *     diff pipeline every other analyst feeds.
+ *   - `toCorpusRecord` → the RL corpus (`CorpusRecord`), pinning the
+ *     diagnosed failure + validated repair as a permanent scenario.
+ *   - `suggestInvariant` → a plain-data hint in the shape the
+ *     trace-contracts machinery consumes (`never` / `without` clauses).
+ */
+
+import type { AnalystFinding, AnalystSeverity, EvidenceRef } from '../analyst/types'
+import { makeFinding } from '../analyst/types'
+import type { CounterfactualMutation } from '../counterfactual'
+import { ValidationError } from '../errors'
+import type { CorpusRecord } from '../rl/corpus'
+import type { RunRecord } from '../run-record'
+import { validateRunRecord } from '../run-record'
+import type { CausalResponsibilityReport, StepResponsibility } from './causal-sweep'
+import type { RepairReport, ValidatedRepair } from './repair'
+
+export const DIAGNOSE_ANALYST_ID = 'diagnose-causal-sweep' // analyst_id stamped on every finding toAnalystFindings emits
+
+/** Translate a step's causal effect into an analyst severity. When the CI
+ *  includes zero the answer is always 'info', however large the mean —
+ *  an effect indistinguishable from noise must not steer priority. */
+export function severityFromEffect(responsibility: StepResponsibility): AnalystSeverity {
+  const { ciExcludesZero, meanEffect } = responsibility
+  if (!ciExcludesZero) return 'info'
+  const size = Math.abs(meanEffect)
+  // Floors are tested from the highest down; the first one reached decides.
+  // The comparisons are `>=`, so a NaN effect falls through to 'low'.
+  return size >= 0.5 ? 'critical' : size >= 0.25 ? 'high' : size >= 0.1 ? 'medium' : 'low'
+}
+
+/** Deterministic human-readable rendering of a mutation — used in
+ *  recommended actions, corpus completions, and invariant hints. A kind
+ *  outside the union throws ValidationError, as `suggestInvariant` does. */
+export function describeMutation(mutation: CounterfactualMutation): string {
+  switch (mutation.kind) {
+    case 'swap-model': return `use model '${mutation.newModel}' at step ${mutation.at}`
+    case 'swap-tool-result': return `replace the tool result at step ${mutation.at} with ${JSON.stringify(mutation.newResult)}`
+    case 'truncate-after': return `stop the run after step ${mutation.at}`
+    case 'inject-system-message': return `inject system message at step ${mutation.at}: ${mutation.content}`
+    case 'custom': return `${mutation.describe} (step ${mutation.at})`
+    default: {
+      const unhandled: never = mutation // compile-time exhaustiveness; reached only from untyped callers
+      throw new ValidationError(`describeMutation: unknown mutation kind ${JSON.stringify(unhandled)}`)
+    }
+  }
+}
+
+/**
+ * Lift a responsibility report (and optionally its validated repairs) into
+ * `AnalystFinding`s via the real `makeFinding` factory. One finding per
+ * `report.steps` row; the first validated repair recorded for that row's
+ * spanId upgrades the finding with a `recommended_action` + the
+ * replay-validation evidence.
+ * Findings are OBSERVED causal probes (replay deltas), not judge verdicts,
+ * so `derived_from_judge` stays unset and they may steer.
+ */
+export function toAnalystFindings(
+  report: CausalResponsibilityReport,
+  repairs?: RepairReport,
+): AnalystFinding[] {
+  const repairByStep = new Map<string, ValidatedRepair>()
+  for (const r of repairs?.repairs ?? []) {
+    if (!repairByStep.has(r.stepRef.spanId)) repairByStep.set(r.stepRef.spanId, r)
+  }
+  // A step probed under several mutations has several rows, hence several findings.
+  return report.steps.map((resp) => {
+    const repair = repairByStep.get(resp.stepRef.spanId)
+    const evidence: EvidenceRef[] = [
+      {
+        kind: 'span',
+        uri: `span://${resp.stepRef.spanId}`,
+        excerpt: `step ${resp.stepRef.index} (${resp.stepRef.kind} '${resp.stepRef.name}') meanEffect=${resp.meanEffect.toFixed(4)} ci=[${resp.ci.lower.toFixed(4)}, ${resp.ci.upper.toFixed(4)}] reps=${resp.reps}`,
+      },
+      {
+        kind: 'metric',
+        uri: `metric://diagnose/${report.runId}/step/${resp.stepRef.index}/${resp.mutationKind}`,
+        excerpt: `deltas=[${resp.deltas.map((d) => d.toFixed(4)).join(', ')}]`,
+      },
+      ...resp.counterfactualRunIds.map((id): EvidenceRef => ({ kind: 'span', uri: `run://${id}` })),
+    ]
+    return makeFinding({
+      analyst_id: DIAGNOSE_ANALYST_ID,
+      severity: severityFromEffect(resp),
+      area: 'causal-attribution',
+      claim: `step '${resp.stepRef.name}' (${resp.stepRef.kind}) is causally responsible for the run outcome under ${resp.mutationKind}`,
+      rationale: resp.ciExcludesZero
+        ? `mean effect ${resp.meanEffect.toFixed(4)} over ${resp.reps} counterfactual replays; CI [${resp.ci.lower.toFixed(4)}, ${resp.ci.upper.toFixed(4)}] excludes zero`
+        : `mean effect ${resp.meanEffect.toFixed(4)} over ${resp.reps} counterfactual replays; CI [${resp.ci.lower.toFixed(4)}, ${resp.ci.upper.toFixed(4)}] includes zero — not distinguishable from noise`,
+      evidence_refs: evidence,
+      recommended_action: repair ? describeMutation(repair.mutation) : undefined,
+      validation_plan: repair
+        ? `replay-validated: ${repair.reps}/${repair.reps} reps scored >= ${repairs!.flipThreshold} (mean ${repair.meanScore.toFixed(4)}, delta ${repair.deltaScore.toFixed(4)})`
+        : undefined,
+      confidence: repair ? 0.95 : resp.ciExcludesZero ? 0.85 : 0.3, // validated repair > CI excludes zero > noise
+      subject: resp.stepRef.spanId,
+      metadata: {
+        stepRef: resp.stepRef,
+        mutationKind: resp.mutationKind,
+        meanEffect: resp.meanEffect,
+        ci: resp.ci,
+        deltas: resp.deltas,
+        counterfactualRunIds: resp.counterfactualRunIds,
+        ...(repair ? { repair: { mutation: repair.mutation, meanScore: repair.meanScore } } : {}),
+      },
+    })
+  })
+}
+
+/**
+ * Turn a diagnosed failure into a permanent corpus scenario: the original
+ * run's `RunRecord` projection plus one validated repair become a fresh
+ * `CorpusRecord`. The runId is re-derived from the blamed span, so corpus
+ * dedup keeps the raw failure and the diagnosed entry side by side.
+ *
+ * Without an explicit `completion` the validated mutation's rendering is
+ * used — "what should have happened", machine-derived. Pass `prompt` (and
+ * optionally a richer `completion`) when the trajectory text is at hand,
+ * so `buildDatasetFromCorpus` can harvest the record.
+ */
+export function toCorpusRecord(
+  run: RunRecord,
+  repair: ValidatedRepair,
+  opts: { prompt?: string; completion?: string } = {},
+): CorpusRecord {
+  const { stepRef, mutation, meanScore, deltaScore, reps } = repair
+  // Diagnosis figures are layered over whatever raw payload the run had.
+  const raw = {
+    ...run.outcome.raw,
+    diagnose_blamed_step_index: stepRef.index,
+    diagnose_repair_mean_score: meanScore,
+    diagnose_repair_delta_score: deltaScore,
+    diagnose_repair_reps: reps,
+  }
+  const record: CorpusRecord = {
+    ...run,
+    runId: `${run.runId}#repair:${stepRef.spanId}`,
+    outcome: { ...run.outcome, raw },
+    prompt: opts.prompt,
+    completion: opts.completion ?? describeMutation(mutation),
+  }
+  // Validate at the boundary: a record that is not a valid RunRecord would
+  // poison every downstream harvest.
+  validateRunRecord(record)
+  return record
+}
+
+/** Plain-data invariant hint. The trace-contracts machinery consumes this
+ *  shape: `never` is a pattern that must not appear in a passing trace;
+ *  `without` is a guard whose absence makes the failure reachable. */
+export interface InvariantHint {
+  description: string // always set by suggestInvariant: what flipped the outcome, with the score delta
+  never?: string // set for swap-tool-result, swap-model and truncate-after repairs
+  without?: string // set for swap-tool-result and inject-system-message repairs
+}
+
+/**
+ * Turn a validated repair into an invariant hint. The mapping is fixed per
+ * mutation kind, so the same repair always yields the same hint — naming
+ * the contract a trace must honour for the failure not to recur silently.
+ */
+export function suggestInvariant(repair: ValidatedRepair): InvariantHint {
+  const { stepRef, mutation } = repair
+  const where = `step ${stepRef.index} (${stepRef.kind} '${stepRef.name}')`
+  // Rendered lazily so an unknown kind reaches the exhaustiveness throw untouched.
+  const flipped = (): string => `flipped the outcome (delta ${repair.deltaScore.toFixed(4)})`
+  if (mutation.kind === 'swap-tool-result') {
+    return {
+      description: `the result of tool '${stepRef.name}' was causally responsible for the failure; a replaced result ${flipped()}`,
+      never: `unvalidated result from tool '${stepRef.name}' flows downstream`,
+      without: `result guard on tool '${stepRef.name}'`,
+    }
+  }
+  if (mutation.kind === 'swap-model') {
+    const { newModel } = mutation
+    return {
+      description: `swapping the model at ${where} to '${newModel}' ${flipped()}`,
+      never: `llm span '${stepRef.name}' runs on a model other than '${newModel}'`,
+    }
+  }
+  if (mutation.kind === 'inject-system-message') {
+    return {
+      description: `injecting a system message at ${where} ${flipped()}`,
+      without: `system message present at '${stepRef.name}': ${mutation.content}`,
+    }
+  }
+  if (mutation.kind === 'truncate-after') {
+    return {
+      description: `stopping after ${where} ${flipped()} — continuation past this step caused the failure`,
+      never: `spans execute after '${stepRef.name}' (index ${stepRef.index})`,
+    }
+  }
+  if (mutation.kind === 'custom') {
+    return { description: `${mutation.describe} at ${where} ${flipped()}` }
+  }
+  const exhausted: never = mutation
+  throw new ValidationError(`suggestInvariant: unknown mutation kind ${JSON.stringify(exhausted)}`)
+}
diff --git a/src/diagnose/repair.ts b/src/diagnose/repair.ts
new file mode 100644
index 0000000..451a165
--- /dev/null
+++ b/src/diagnose/repair.ts
@@ -0,0 +1,200 @@
+/**
+ * Replay-validated repair — WHAT SHOULD HAVE HAPPENED?
+ *
+ * Takes the blamed steps from a `CausalResponsibilityReport`, asks a
+ * consumer-supplied `proposeFix` (LLM-backed in live use) for candidate
+ * mutations, and machine-verifies each candidate by replaying the run
+ * WITH the mutation applied (through the same `runCounterfactual` seam
+ * the sweep uses).
+ *
+ * A repair is "what should have happened" ONLY when every validation
+ * replay crosses `flipThreshold` — a prescription is never speculated,
+ * it is demonstrated. Candidates that don't flip, or whose replay
+ * errors, land in `rejected` with a typed reason; nothing is dropped
+ * silently.
+ */
+
+import {
+  type CounterfactualMutation,
+  type CounterfactualRunner,
+  runCounterfactual,
+} from '../counterfactual'
+import { ValidationError } from '../errors'
+import type { TraceStore } from '../trace/store'
+import { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'
+import type { StepRef, StepResponsibility } from './causal-sweep'
+
+/** Context handed to `proposeFix` so an LLM-backed proposer can see the
+ *  full trajectory plus the responsibility evidence for the blamed step. */
+export interface RepairContext {
+  runId: string // the run being repaired
+  trajectory: Trajectory // built once per prescribeRepair call and shared by every proposeFix call
+  originalScore: number // the run's finite outcome.score — the baseline a flip is measured against
+  responsibility: StepResponsibility // the blamed entry this proposeFix call is about
+}
+
+export interface PrescribeRepairOptions {
+  store: TraceStore // read for the run and its trajectory; also passed to every runCounterfactual call
+  /** The failed run the sweep diagnosed. */
+  runId: string
+  /** Execution seam — same `CounterfactualRunner` contract as the sweep. */
+  runner: CounterfactualRunner
+  /** Blamed steps from `causalSweep` — typically `report.steps.slice(0, k)`. Must be non-empty. */
+  blamed: StepResponsibility[]
+  /** Candidate-fix generator. Consumer-supplied; LLM-backed in live use.
+   *  Returned mutations MUST target the blamed step's index (else ValidationError). */
+  proposeFix: (step: TrajectoryStep, context: RepairContext) => Promise<CounterfactualMutation[]>
+  /** Score every validation replay must reach (score >= threshold) for the repair to count. Default 0.5. */
+  flipThreshold?: number
+  /** Validation replays per candidate mutation (integer >= 1). Default 3. */
+  repsToValidate?: number
+  /** Max candidate mutations tried per step; later candidates are never replayed. Default: all proposed. */
+  maxAttemptsPerStep?: number
+}
+
+export interface ValidatedRepair {
+  stepRef: StepRef // the blamed step this repair applies to
+  mutation: CounterfactualMutation // the candidate that was replayed and validated
+  /** Always true — presence in `repairs` IS the machine-verified claim. */
+  validated: true
+  /** Mean counterfactual score across the validation reps. */
+  meanScore: number
+  /** meanScore − originalScore. */
+  deltaScore: number
+  reps: number // validation replays run for this repair (repsToValidate)
+  /** Replay run ids backing the validation — audit trail. */
+  counterfactualRunIds: string[]
+}
+
+export interface RejectedRepair {
+  stepRef: StepRef // the blamed step the candidate targeted
+  mutation: CounterfactualMutation // the candidate that was tried
+  reason: 'did-not-flip' | 'error' // 'error' also covers a replay that returned no numeric score
+  /** Present for 'did-not-flip': mean validation score − originalScore (every rep ran). */
+  deltaScore?: number
+  /** Present for 'error': the message, preserved for diagnosis. */
+  error?: string
+}
+
+export interface RepairReport {
+  runId: string // the run that was repaired
+  originalScore: number // that run's outcome.score
+  flipThreshold: number // threshold actually applied (the option, or 0.5 when omitted)
+  repairs: ValidatedRepair[] // at most one per blamed entry — the first candidate that validated
+  rejected: RejectedRepair[] // candidates that were replayed and did not validate
+  replaysUsed: number // validation replays attempted, counting a rep whose replay threw
+}
+
+/** Replay-validate candidate fixes for each blamed step. `proposeFix` supplies
+ *  mutations; one becomes a repair only if every validation replay scores
+ *  >= `flipThreshold`, and the first repair found ends that step. Invalid
+ *  options, a missing/unscored run or a stale report throw ValidationError. */
+export async function prescribeRepair(opts: PrescribeRepairOptions): Promise<RepairReport> {
+  const flipThreshold = opts.flipThreshold ?? 0.5
+  const repsToValidate = opts.repsToValidate ?? 3
+  if (!Number.isInteger(repsToValidate) || repsToValidate < 1) {
+    throw new ValidationError(
+      `prescribeRepair: repsToValidate must be an integer >= 1 (got ${repsToValidate})`,
+    )
+  }
+  const maxAttempts = opts.maxAttemptsPerStep ?? Number.POSITIVE_INFINITY
+  if (Number.isNaN(maxAttempts) || maxAttempts < 1) { // `NaN < 1` is false, and slice(0, NaN) tries nothing
+    throw new ValidationError(
+      `prescribeRepair: maxAttemptsPerStep must be >= 1 (got ${opts.maxAttemptsPerStep})`,
+    )
+  }
+  if (opts.blamed.length === 0) {
+    throw new ValidationError('prescribeRepair: blamed is empty — nothing to repair')
+  }
+
+  const originalRun = await opts.store.getRun(opts.runId)
+  if (!originalRun) throw new ValidationError(`prescribeRepair: run ${opts.runId} not found`)
+  const originalScore = originalRun.outcome?.score
+  if (typeof originalScore !== 'number' || !Number.isFinite(originalScore)) {
+    throw new ValidationError(
+      `prescribeRepair: run ${opts.runId} has no numeric outcome.score — flips have no baseline`,
+    )
+  }
+  const trajectory = await buildTrajectory(opts.store, opts.runId)
+  const repairs: ValidatedRepair[] = []
+  const rejected: RejectedRepair[] = []
+  let replaysUsed = 0
+
+  for (const responsibility of opts.blamed) {
+    const step = trajectory.steps[responsibility.stepRef.index]
+    if (!step || step.span.spanId !== responsibility.stepRef.spanId) {
+      throw new ValidationError(
+        `prescribeRepair: blamed step index=${responsibility.stepRef.index} spanId=${responsibility.stepRef.spanId} does not match run ${opts.runId} — stale report?`,
+      )
+    }
+    const candidates = await opts.proposeFix(step, {
+      runId: opts.runId,
+      trajectory,
+      originalScore,
+      responsibility,
+    })
+    const toTry = candidates.slice(0, maxAttempts) // Infinity keeps every candidate
+    for (const mutation of toTry) {
+      if (mutation.at !== step.index) {
+        throw new ValidationError(
+          `prescribeRepair: proposeFix returned a mutation targeting at=${mutation.at} for blamed step index=${step.index}`,
+        )
+      }
+      const scores: number[] = []
+      const cfRunIds: string[] = []
+      let failure: string | undefined
+      for (let rep = 0; rep < repsToValidate; rep++) {
+        try {
+          const result = await runCounterfactual(opts.store, opts.runId, mutation, opts.runner)
+          replaysUsed++
+          const score = result.delta.counterfactualOutcomeScore
+          if (typeof score !== 'number' || !Number.isFinite(score)) {
+            failure = `validation rep ${rep} produced no numeric score — the runner must endRun with a numeric outcome.score`
+            break
+          }
+          scores.push(score)
+          cfRunIds.push(result.counterfactualRunId)
+        } catch (err) {
+          replaysUsed++ // a replay that threw is still counted as used
+          failure = err instanceof Error ? err.message : String(err)
+          break
+        }
+      }
+
+      if (failure !== undefined) {
+        rejected.push({
+          stepRef: responsibility.stepRef,
+          mutation,
+          reason: 'error',
+          error: failure,
+        })
+        continue
+      }
+      // Every rep ran: the bar is per-rep, not on the mean.
+      const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length
+      const everyRepFlipped = scores.every((s) => s >= flipThreshold)
+      if (everyRepFlipped) {
+        repairs.push({
+          stepRef: responsibility.stepRef,
+          mutation,
+          validated: true,
+          meanScore,
+          deltaScore: meanScore - originalScore,
+          reps: repsToValidate,
+          counterfactualRunIds: cfRunIds,
+        })
+        // First validated repair per step IS the prescription; remaining
+        // candidates are untried, not rejected — we don't fabricate verdicts.
+        break
+      }
+      rejected.push({
+        stepRef: responsibility.stepRef,
+        mutation,
+        reason: 'did-not-flip',
+        deltaScore: meanScore - originalScore,
+      })
+    }
+  }
+
+  return { runId: opts.runId, originalScore, flipThreshold, repairs, rejected, replaysUsed }
+}
diff --git a/tests/diagnose.test.ts b/tests/diagnose.test.ts
new file mode 100644
index 0000000..14bf9c6
--- /dev/null
+++ b/tests/diagnose.test.ts
@@ -0,0 +1,490 @@
+import { describe, expect, it } from 'vitest'
+import type { CounterfactualMutation, CounterfactualRunner } from '../src/counterfactual'
+import {
+  causalSweep,
+  DIAGNOSE_ANALYST_ID,
+  describeMutation,
+  prescribeRepair,
+  suggestInvariant,
+  toAnalystFindings,
+  toCorpusRecord,
+  type ValidatedRepair,
+} from '../src/diagnose'
+import type { RunRecord } from '../src/run-record'
+import type { ToolSpan } from '../src/trace'
+import { InMemoryTraceStore, TraceEmitter } from '../src/trace'
+
+function mulberry32(seed: number): () => number { // seeded PRNG factory: same seed → same sequence
+  let s = seed | 0 // coerce the seed to a 32-bit integer state
+  return () => {
+    s = (s + 0x6d2b79f5) | 0 // advance the state by a fixed increment, wrapping to 32 bits
+    let t = s
+    t = Math.imul(t ^ (t >>> 15), t | 1) // 32-bit multiply / xor-shift mixing of the state
+    t ^= t + Math.imul(t ^ (t >>> 7), t | 61)
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296 // unsigned 32-bit value scaled into [0, 1)
+  }
+}
+
+async function seedRun( // emits one complete run through a TraceEmitter bound to `store`; resolves to e.runId
+  store: InMemoryTraceStore,
+  outputScore: number, // score passed to endRun as the run's final outcome score
+  shape: Array<{ kind: 'llm' | 'tool'; name: string; model?: string; toolName?: string }>,
+): Promise<string> {
+  const e = new TraceEmitter(store)
+  await e.startRun({ scenarioId: 's' })
+  for (const s of shape) { // one span per shape entry, opened and ended in order
+    if (s.kind === 'llm') {
+      const h = await e.span({
+        kind: 'llm',
+        name: s.name,
+        model: s.model ?? 'm', // placeholder model name when the entry gives none
+        messages: [],
+        output: 'x',
+      })
+      await h.end()
+    } else {
+      const h = await e.span({
+        kind: 'tool',
+        name: s.name,
+        toolName: s.toolName ?? s.name, // tool name falls back to the span name
+        args: {},
+      })
+      await h.end({ result: 'rate=WRONG' } as Partial<ToolSpan>) // every tool span ends with this fixed result
+    }
+  }
+  await e.endRun({ pass: false, score: outputScore }) // the seeded run is always ended with pass: false
+  return e.runId
+}
+
+const SHAPE = [ // four-step run fixture passed to seedRun: llm, tool, tool, llm (indices 0–3)
+  { kind: 'llm' as const, name: 'plan' },
+  { kind: 'tool' as const, name: 'fetch-rates' }, // index 1 — the step makeRunner's default scoring flips on
+  { kind: 'tool' as const, name: 'format' }, // index 2 — used by the tests as the no-effect step
+  { kind: 'llm' as const, name: 'answer' },
+]
+
+/**
+ * Deterministic fake of the execution seam (same pattern as the runCounterfactual tests in
+ * tier2.test.ts). Base score: `scoreFor(mutation)` when supplied, otherwise 0.8 for a
+ * swap-tool-result at index 1 (the faulty fetch-rates step) and 0.2 for anything else; seeded
+ * noise within ±0.02 is added to every score. `calls` logs each mutation received, in order.
+ */
+function makeRunner(opts: { seed: number; scoreFor?: (m: CounterfactualMutation) => number }): {
+  runner: CounterfactualRunner
+  calls: CounterfactualMutation[]
+} {
+  const rng = mulberry32(opts.seed)
+  const calls: CounterfactualMutation[] = []
+  const runner: CounterfactualRunner = {
+    async executeFrom(ctx, emitter) {
+      calls.push(ctx.mutation)
+      // Symmetric two-draw noise (each draw within ±0.01) so per-rep deltas straddle the mean.
+      const noise = (rng() - 0.5) * 0.02 + (rng() - 0.5) * 0.02
+      const base =
+        opts.scoreFor?.(ctx.mutation) ??
+        (ctx.mutation.kind === 'swap-tool-result' && ctx.mutation.at === 1 ? 0.8 : 0.2)
+      await emitter.endRun({ pass: base >= 0.5, score: base + noise }) // pass uses the noise-free base
+    },
+  }
+  return { runner, calls }
+}
+
+describe('causalSweep', () => {
+  it('ranks the injected-fault step #1 with CI excluding zero; no-effect step CI includes zero', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner } = makeRunner({ seed: 42 })
+
+    const report = await causalSweep({
+      store,
+      runId,
+      runner,
+      candidateSteps: [1, 2],
+      reps: 5,
+      budget: 100,
+      ciSeed: 7,
+    })
+
+    expect(report.steps).toHaveLength(2)
+    const [top, rest] = report.steps // expected in rank order: the fault step first
+    expect(top!.stepRef.index).toBe(1)
+    expect(top!.stepRef.name).toBe('fetch-rates')
+    expect(top!.mutationKind).toBe('swap-tool-result')
+    expect(top!.meanEffect).toBeGreaterThan(0.5) // fake runner scores ~0.8 against the original 0.2
+    expect(top!.ciExcludesZero).toBe(true)
+    expect(top!.ci.lower).toBeGreaterThan(0)
+    expect(top!.deltas).toHaveLength(5) // one delta per rep
+
+    expect(rest!.stepRef.index).toBe(2)
+    expect(rest!.ciExcludesZero).toBe(false)
+    expect(rest!.ci.lower).toBeLessThanOrEqual(0)
+    expect(rest!.ci.upper).toBeGreaterThanOrEqual(0)
+
+    expect(report.replaysUsed).toBe(10) // 2 candidate steps × 5 reps
+    expect(report.uncovered).toHaveLength(0)
+    expect(report.originalScore).toBeCloseTo(0.2)
+    expect(report.byMutationKind[0]!.mutationKind).toBe('swap-tool-result')
+    expect(report.byMutationKind[0]!.n).toBe(10) // all 10 replays fall under the same mutation kind
+  })
+
+  it('records counterfactual replays as meta runs parented to the original', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner } = makeRunner({ seed: 1 })
+    const report = await causalSweep({
+      store,
+      runId,
+      runner,
+      candidateSteps: [1],
+      reps: 2,
+      budget: 10,
+    })
+    const cfRun = await store.getRun(report.steps[0]!.counterfactualRunIds[0]!) // first replay of the only step
+    expect(cfRun?.parentRunId).toBe(runId)
+    expect(cfRun?.layer).toBe('meta')
+  })
+
+  it('names uncovered steps under a tight budget instead of silently dropping them', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner } = makeRunner({ seed: 9 })
+
+    const report = await causalSweep({
+      store,
+      runId,
+      runner,
+      candidateSteps: [1, 2],
+      reps: 4,
+      budget: 6, // covers one step's 4 reps; the remaining 2 are too few for a second step
+    })
+
+    expect(report.steps).toHaveLength(1)
+    expect(report.steps[0]!.stepRef.index).toBe(1)
+    expect(report.replaysUsed).toBe(4) // no partial sweep of step 2
+    expect(report.uncovered).toHaveLength(1)
+    expect(report.uncovered[0]!.index).toBe(2)
+    expect(report.uncovered[0]!.name).toBe('format')
+  })
+
+  it('covers nothing when budget < reps — everything uncovered, zero replays', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner, calls } = makeRunner({ seed: 9 })
+    const report = await causalSweep({
+      store,
+      runId,
+      runner,
+      candidateSteps: [1, 2],
+      reps: 4,
+      budget: 3, // smaller than a single step's 4 reps
+    })
+    expect(report.steps).toHaveLength(0)
+    expect(report.replaysUsed).toBe(0)
+    expect(calls).toHaveLength(0) // the runner itself was never invoked
+    expect(report.uncovered.map((s) => s.index)).toEqual([1, 2])
+  })
+
+  it('defaults candidate steps to llm + tool spans with the payload-free probe kinds', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner, calls } = makeRunner({ seed: 3 })
+    await causalSweep({ store, runId, runner, reps: 2, budget: 100 })
+    // 4 steps in SHAPE × 1 default mutation per step × 2 reps = 8 runner calls
+    expect(calls).toHaveLength(8)
+    const kinds = new Set(calls.map((c) => c.kind))
+    expect(kinds).toEqual(new Set(['truncate-after', 'swap-tool-result']))
+  })
+
+  it('rejects reps < 2 — a single intervention delta is noise, not measurement', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner } = makeRunner({ seed: 3 })
+    await expect(causalSweep({ store, runId, runner, reps: 1, budget: 10 })).rejects.toThrow(
+      /reps must be an integer >= 2/,
+    )
+  })
+
+  it('fails loud when the original run has no numeric score', async () => {
+    const store = new InMemoryTraceStore()
+    const e = new TraceEmitter(store)
+    await e.startRun({ scenarioId: 's' })
+    await e.endRun({ pass: false }) // original run ended without a score
+    const { runner } = makeRunner({ seed: 3 })
+    await expect(
+      causalSweep({ store, runId: e.runId, runner, reps: 2, budget: 10 }),
+    ).rejects.toThrow(/no numeric outcome\.score/)
+  })
+
+  it('fails loud when a replay omits the score instead of recording a bogus delta', async () => {
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const runner: CounterfactualRunner = {
+      async executeFrom(_ctx, emitter) {
+        await emitter.endRun({ pass: true }) // replay ends without a score
+      },
+    }
+    await expect(
+      causalSweep({ store, runId, runner, candidateSteps: [1], reps: 2, budget: 10 }),
+    ).rejects.toThrow(/runner must endRun with a numeric outcome\.score/)
+  })
+})
+
+describe('prescribeRepair', () => {
+  async function diagnosedSetup() { // seeds a failing run and sweeps steps 1 and 2 to get a report
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const { runner } = makeRunner({ seed: 42 })
+    const report = await causalSweep({
+      store,
+      runId,
+      runner,
+      candidateSteps: [1, 2],
+      reps: 5,
+      budget: 100,
+      ciSeed: 7,
+    })
+    return { store, runId, report }
+  }
+
+  const goodFix: CounterfactualMutation = {
+    kind: 'swap-tool-result',
+    at: 1,
+    newResult: { rate: 4.5 },
+  }
+  const badFix: CounterfactualMutation = { kind: 'swap-tool-result', at: 1, newResult: 'garbage' }
+
+  it('emits only flipping mutations; non-flippers land in rejected with reason', async () => {
+    const { store, runId, report } = await diagnosedSetup()
+    const { runner } = makeRunner({
+      seed: 11,
+      scoreFor: (m) => // ~0.9 only when the swapped-in payload equals goodFix's; ~0.3 otherwise
+        m.kind === 'swap-tool-result' &&
+        JSON.stringify(m.newResult) === JSON.stringify(goodFix.newResult)
+          ? 0.9
+          : 0.3,
+    })
+
+    const repair = await prescribeRepair({
+      store,
+      runId,
+      runner,
+      blamed: report.steps.slice(0, 1), // only the top-ranked step
+      proposeFix: async () => [badFix, goodFix], // bad candidate listed first
+      flipThreshold: 0.5,
+      repsToValidate: 3,
+    })
+
+    expect(repair.repairs).toHaveLength(1)
+    const validated = repair.repairs[0]!
+    expect(validated.validated).toBe(true)
+    expect(validated.mutation).toEqual(goodFix)
+    expect(validated.stepRef.index).toBe(1)
+    expect(validated.meanScore).toBeGreaterThanOrEqual(0.5)
+    expect(validated.deltaScore).toBeCloseTo(validated.meanScore - 0.2, 10) // 0.2 is the seeded original score
+    expect(validated.reps).toBe(3)
+    expect(validated.counterfactualRunIds).toHaveLength(3)
+
+    expect(repair.rejected).toHaveLength(1)
+    expect(repair.rejected[0]!.reason).toBe('did-not-flip')
+    expect(repair.rejected[0]!.mutation).toEqual(badFix)
+    expect(repair.rejected[0]!.deltaScore).toBeCloseTo(0.1, 1) // ~0.3 replay score minus the 0.2 original
+    expect(repair.replaysUsed).toBe(6) // 2 candidates × 3 validation reps
+  })
+
+  it('a repair must flip on EVERY validation rep, not on average', async () => {
+    const { store, runId, report } = await diagnosedSetup()
+    // Scores alternate 0.9 / 0.4: the 3-rep mean (~0.73) crosses the threshold but the 2nd rep does not.
+    let call = 0
+    const runner: CounterfactualRunner = {
+      async executeFrom(_ctx, emitter) {
+        call++
+        await emitter.endRun({ pass: true, score: call % 2 === 1 ? 0.9 : 0.4 }) // odd calls 0.9, even calls 0.4
+      },
+    }
+    const repair = await prescribeRepair({
+      store,
+      runId,
+      runner,
+      blamed: report.steps.slice(0, 1),
+      proposeFix: async () => [goodFix],
+      repsToValidate: 3,
+    })
+    expect(repair.repairs).toHaveLength(0)
+    expect(repair.rejected[0]!.reason).toBe('did-not-flip')
+  })
+
+  it('replay errors become typed rejections, never silent drops', async () => {
+    const { store, runId, report } = await diagnosedSetup()
+    const { runner } = makeRunner({ seed: 5 })
+    const explosive: CounterfactualMutation = {
+      kind: 'custom',
+      at: 1,
+      describe: 'patch the parser',
+      apply: () => { // a custom mutation whose apply always throws
+        throw new Error('boom: parser patch unapplicable')
+      },
+    }
+    const repair = await prescribeRepair({
+      store,
+      runId,
+      runner,
+      blamed: report.steps.slice(0, 1),
+      proposeFix: async () => [explosive],
+    })
+    expect(repair.repairs).toHaveLength(0)
+    expect(repair.rejected).toHaveLength(1)
+    expect(repair.rejected[0]!.reason).toBe('error')
+    expect(repair.rejected[0]!.error).toMatch(/boom/) // the thrown message is carried on the rejection
+  })
+
+  it('respects maxAttemptsPerStep', async () => {
+    const { store, runId, report } = await diagnosedSetup()
+    const { runner } = makeRunner({ seed: 5, scoreFor: () => 0.3 }) // every candidate scores ~0.3
+    const repair = await prescribeRepair({
+      store,
+      runId,
+      runner,
+      blamed: report.steps.slice(0, 1),
+      proposeFix: async () => [badFix, goodFix],
+      maxAttemptsPerStep: 1, // only the first proposed candidate (badFix) should be tried
+    })
+    expect(repair.repairs).toHaveLength(0)
+    expect(repair.rejected).toHaveLength(1)
+    expect(repair.rejected[0]!.mutation).toEqual(badFix)
+  })
+
+  it('rejects a stale report whose stepRef does not match the run', async () => {
+    const { store, runId, report } = await diagnosedSetup()
+    const { runner } = makeRunner({ seed: 5 })
+    const stale = { ...report.steps[0]!, stepRef: { ...report.steps[0]!.stepRef, spanId: 'nope' } }
+    await expect(
+      prescribeRepair({
+        store,
+        runId,
+        runner,
+        blamed: [stale], // same step entry, but with its stepRef.spanId overwritten to 'nope'
+        proposeFix: async () => [goodFix],
+      }),
+    ).rejects.toThrow(/does not match run/)
+  })
+})
+
+describe('remediation adapters', () => {
+  async function fullChain() { // shared setup: seed a failing run → causalSweep → prescribeRepair
+    const store = new InMemoryTraceStore()
+    const runId = await seedRun(store, 0.2, SHAPE)
+    const sweep = makeRunner({ seed: 42 })
+    const report = await causalSweep({
+      store,
+      runId,
+      runner: sweep.runner,
+      candidateSteps: [1, 2],
+      reps: 5,
+      budget: 100,
+      ciSeed: 7,
+    })
+    const fix: CounterfactualMutation = {
+      kind: 'swap-tool-result',
+      at: 1,
+      newResult: { rate: 4.5 },
+    }
+    const validate = makeRunner({ seed: 11, scoreFor: () => 0.9 }) // every validation replay scores ~0.9
+    const repairs = await prescribeRepair({
+      store,
+      runId,
+      runner: validate.runner,
+      blamed: report.steps.slice(0, 1), // only the top-ranked step
+      proposeFix: async () => [fix],
+    })
+    return { report, repairs }
+  }
+
+  it('toAnalystFindings emits schema-valid findings with effect-scaled severity', async () => {
+    const { report, repairs } = await fullChain()
+    const findings = toAnalystFindings(report, repairs)
+    expect(findings).toHaveLength(2) // one finding per swept step
+
+    for (const f of findings) {
+      expect(f.schema_version).toBe('1.0.0')
+      expect(f.finding_id).toMatch(/^f_[0-9a-f]{20}$/)
+      expect(f.analyst_id).toBe(DIAGNOSE_ANALYST_ID)
+      expect(f.area).toBe('causal-attribution')
+      expect(f.evidence_refs.length).toBeGreaterThanOrEqual(2)
+      expect(f.derived_from_judge).toBeUndefined()
+    }
+
+    const blamed = findings.find((f) => f.subject === report.steps[0]!.stepRef.spanId)! // finding for the top step
+    expect(blamed.severity).toBe('critical')
+    expect(blamed.confidence).toBe(0.95)
+    expect(blamed.recommended_action).toBe(describeMutation(repairs.repairs[0]!.mutation))
+    expect(blamed.validation_plan).toMatch(/replay-validated: 3\/3 reps scored >= 0\.5/)
+    expect(blamed.evidence_refs[0]!.uri).toBe(`span://${report.steps[0]!.stepRef.spanId}`)
+    expect(blamed.evidence_refs[1]!.excerpt).toContain('deltas=[')
+
+    const noise = findings.find((f) => f.subject === report.steps[1]!.stepRef.spanId)! // finding for the no-effect step
+    expect(noise.severity).toBe('info')
+    expect(noise.confidence).toBe(0.3)
+    expect(noise.recommended_action).toBeUndefined() // no validated repair exists for this step
+  })
+
+  it('toCorpusRecord pins the failure as a fresh, schema-valid corpus scenario', async () => {
+    const { repairs } = await fullChain()
+    const original: RunRecord = {
+      runId: 'run-original',
+      experimentId: 'exp-1',
+      candidateId: 'cand-1',
+      seed: 42,
+      model: 'test-model@2026-01-01',
+      promptHash: 'p'.repeat(8),
+      configHash: 'c'.repeat(8),
+      commitSha: 'deadbeef',
+      wallMs: 1200,
+      costUsd: 0.01,
+      tokenUsage: { input: 100, output: 50 },
+      outcome: { searchScore: 0.2, raw: {} },
+      splitTag: 'search',
+    }
+    const repair = repairs.repairs[0]!
+    const pinned = toCorpusRecord(original, repair, { prompt: 'fetch the current rates' })
+
+    expect(pinned.runId).toBe(`run-original#repair:${repair.stepRef.spanId}`)
+    expect(pinned.runId).not.toBe(original.runId)
+    expect(pinned.prompt).toBe('fetch the current rates')
+    expect(pinned.completion).toBe(describeMutation(repair.mutation))
+    expect(pinned.outcome.raw.diagnose_blamed_step_index).toBe(1)
+    expect(pinned.outcome.raw.diagnose_repair_mean_score).toBeCloseTo(repair.meanScore, 10)
+    expect(pinned.outcome.raw.diagnose_repair_delta_score).toBeCloseTo(repair.deltaScore, 10)
+    // Original record untouched: the diagnose_* keys must not appear on the input's outcome.raw.
+    expect(original.outcome.raw.diagnose_blamed_step_index).toBeUndefined()
+  })
+
+  it('suggestInvariant derives never/without clauses per mutation kind', async () => {
+    const { repairs } = await fullChain()
+    const toolHint = suggestInvariant(repairs.repairs[0]!) // the validated swap-tool-result repair
+    expect(toolHint.description).toContain('fetch-rates')
+    expect(toolHint.never).toContain("tool 'fetch-rates'")
+    expect(toolHint.without).toContain("tool 'fetch-rates'")
+
+    const base = repairs.repairs[0]! // reused below with only the mutation replaced
+    const truncate: ValidatedRepair = {
+      ...base,
+      mutation: { kind: 'truncate-after', at: 1 },
+    }
+    const truncateHint = suggestInvariant(truncate)
+    expect(truncateHint.never).toContain('after')
+    expect(truncateHint.without).toBeUndefined()
+
+    const inject: ValidatedRepair = {
+      ...base,
+      mutation: { kind: 'inject-system-message', at: 1, content: 'always validate rates' },
+    }
+    const injectHint = suggestInvariant(inject)
+    expect(injectHint.without).toContain('always validate rates')
+
+    const swapModel: ValidatedRepair = {
+      ...base,
+      mutation: { kind: 'swap-model', at: 1, newModel: 'better-model@2026-01-01' },
+    }
+    expect(suggestInvariant(swapModel).never).toContain('better-model@2026-01-01')
+  })
+})
diff --git a/tsup.config.ts b/tsup.config.ts
index 2e69b76..55aac36 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -7,6 +7,7 @@ export default defineConfig({
     control: 'src/control.ts',
     reporting: 'src/reporting.ts',
     rl: 'src/rl/index.ts',
+    diagnose: 'src/diagnose/index.ts',
     traces: 'src/traces.ts',
     'telemetry/index': 'src/telemetry/index.ts',
     'telemetry/file': 'src/telemetry/sink-file.ts',