Merged
31 changes: 18 additions & 13 deletions packages/junior-evals/README.md
@@ -4,9 +4,9 @@

Evals are end-to-end Slack conversation evaluations. They are the integration-style test layer for agent-facing behavior when model interpretation is part of the contract.

-- We define conversation cases inline in TypeScript using `slackEval()`.
+- We define conversation cases inline in TypeScript using `describeEval()` and the shared `slackEvals` harness options.
- We run the real runtime/harness against those fixtures.
-- We score outcomes with an LLM judge via `vitest-evals`.
+- We score outcomes with a `vitest-evals` judge that reuses the Slack harness prompt seam, backed by Junior's Pi client and the Vercel AI Gateway model `openai/gpt-5.4`.

## Layer Boundaries

@@ -52,7 +52,7 @@ Not in scope:

## Execution Model

-For each case (`slackEval()` call):
+For each `it()` case inside a `describeEval()` suite:

1. Replay events through the harness via `runEvalScenario()`.
2. Create a fresh runtime instance for the case via the chat composition root; do not mutate the production singleton runtime.
@@ -97,7 +97,7 @@ Evals require real Vercel Sandbox access. If sandbox bootstrap fails, the eval f

## Authoring Rules

-- Add core cases under `evals/core/*.eval.ts` and plugin-specific cases under `evals/<plugin>/` using `slackEval()`.
+- Add core cases under `evals/core/*.eval.ts` and plugin-specific cases under `evals/<plugin>/` using `describeEval()` with `slackEvals`.
- Use event builders (`mention`, `threadMessage`, `threadStart`) from `evals/helpers.ts`.
- Use `auto_complete_mcp_oauth` or `auto_complete_oauth` when the harness should instantly complete the fake provider callback after our app has genuinely initiated auth.
- For multi-turn, pass the same `thread` override so events land in one thread.
@@ -109,7 +109,7 @@ Evals require real Vercel Sandbox access. If sandbox bootstrap fails, the eval f
- `allow` should list acceptable optional variations.
- `fail` should list forbidden outputs or failure conditions.
- Do not write judge criteria as one dense paragraph.
-- Let the `describe()` block own the behavior area. The file path and `describe()` context already provide scope.
+- Let the `describeEval()` block own the behavior area. The file path and `describeEval()` context already provide scope.
- Each eval name should only state the specific scenario and outcome.
- Prefer `when <trigger>, <outcome>` over vague labels like `continuity: remembers prior turn context`.
- Keep user prompts natural. They should read like plausible user requests, not scripted implementation instructions.
@@ -159,13 +159,18 @@ Avoid:
## Minimal Case

```typescript
-import { mention, rubric, slackEval } from "../helpers";
-
-slackEval("when explicitly mentioned, post one direct reply", {
-  events: [mention("<@U_APP> summarize this")],
-  criteria: rubric({
-    contract: "An explicit mention gets one direct reply.",
-    pass: ["The assistant posts exactly one reply to the mention."],
-  }),
+import { describeEval } from "vitest-evals";
+import { mention, rubric, slackEvals } from "../helpers";
+
+describeEval("Routing", slackEvals, (it) => {
+  it("when explicitly mentioned, post one direct reply", async ({ run }) => {
+    await run({
+      events: [mention("<@U_APP> summarize this")],
+      criteria: rubric({
+        contract: "An explicit mention gets one direct reply.",
+        pass: ["The assistant posts exactly one reply to the mention."],
+      }),
+    });
+  });
+});
```
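The `rubric()` helper used in the minimal case above presumably flattens the structured criteria (`contract` / `pass` / `allow` / `fail`) into a single judge prompt string. A minimal self-contained sketch of that idea follows — the field names mirror the README's authoring rules, but the rendered output format is an assumption, and the real helper in `evals/helpers.ts` may differ:

```typescript
// Hypothetical sketch of a rubric() criteria builder.
// The Rubric shape mirrors the README's authoring rules; the rendered
// judge-prompt format below is an assumption, not the real helper.
interface Rubric {
  contract: string; // the behavior under test, one sentence
  pass: string[]; // required observations
  allow?: string[]; // acceptable optional variations
  fail?: string[]; // forbidden outputs or failure conditions
}

function rubric(r: Rubric): string {
  // Render one labeled list per populated field, skipping empty ones.
  const section = (label: string, items?: string[]): string =>
    items && items.length > 0
      ? `${label}:\n${items.map((item) => `- ${item}`).join("\n")}`
      : "";
  return [
    `Contract: ${r.contract}`,
    section("Pass", r.pass),
    section("Allow", r.allow),
    section("Fail", r.fail),
  ]
    .filter(Boolean)
    .join("\n\n");
}
```

Keeping the criteria as labeled lists rather than one dense paragraph is exactly what the authoring rules above require, so whatever the real implementation does, it should preserve that structure for the judge.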
39 changes: 21 additions & 18 deletions packages/junior-evals/evals/core/lifecycle-and-resilience.eval.ts
@@ -1,10 +1,11 @@
-import { describe } from "vitest";
-import { mention, rubric, slackEval, threadStart } from "../helpers";
+import { describeEval } from "vitest-evals";
+import { mention, rubric, slackEvals, threadStart } from "../helpers";

-describe("Lifecycle and Resilience", () => {
-slackEval(
-"when an assistant thread starts, set title and prompts without posting a reply",
-{
+describeEval("Lifecycle and Resilience", slackEvals, (it) => {
+it("when an assistant thread starts, set title and prompts without posting a reply", async ({
+run,
+}) => {
+await run({
events: [threadStart()],
criteria: rubric({
contract:
@@ -15,12 +16,13 @@ describe("Lifecycle and Resilience", () => {
"Suggested prompts are set exactly once.",
],
}),
-},
-);
+});
+});

-slackEval(
-"when reply generation fails before any answer, post one clear error reply",
-{
+it("when reply generation fails before any answer, post one clear error reply", async ({
+run,
+}) => {
+await run({
overrides: { fail_reply_call: 1 },
events: [mention("What's the status of the deploy?")],
criteria: rubric({
@@ -34,12 +36,13 @@
"Do not leak stack traces, exception text, or debugging narration in the reply.",
],
}),
-},
-);
+});
+});

-slackEval(
-"when a short reply is interrupted by the provider, keep the partial answer in one marked post",
-{
+it("when a short reply is interrupted by the provider, keep the partial answer in one marked post", async ({
+run,
+}) => {
+await run({
overrides: {
reply_results: [
{
@@ -63,6 +66,6 @@
"Do not mention provider internals, execution failure details, or logged-for-debugging text.",
],
}),
-},
-);
+});
+});
});
17 changes: 9 additions & 8 deletions packages/junior-evals/evals/core/media-and-attachments.eval.ts
@@ -1,10 +1,11 @@
-import { describe } from "vitest";
-import { mention, rubric, slackEval } from "../helpers";
+import { describeEval } from "vitest-evals";
+import { mention, rubric, slackEvals } from "../helpers";

-describe("Media and Attachments", () => {
-slackEval(
-"when the user asks for an image, attach an image instead of replying with text alone",
-{
+describeEval("Media and Attachments", slackEvals, (it) => {
+it("when the user asks for an image, attach an image instead of replying with text alone", async ({
+run,
+}) => {
+await run({
overrides: { mock_image_generation: true },
events: [mention("show me how you feel")],
criteria: rubric({
@@ -17,6 +18,6 @@ describe("Media and Attachments", () => {
"Do not include sandbox setup failure text.",
],
}),
-},
-);
+});
+});
});
42 changes: 21 additions & 21 deletions packages/junior-evals/evals/core/oauth-workflows.eval.ts
@@ -1,16 +1,17 @@
-import { describe } from "vitest";
-import { rubric, slackEval, threadMessage } from "../helpers";
+import { describeEval } from "vitest-evals";
+import { rubric, slackEvals, threadMessage } from "../helpers";

-describe("OAuth Workflows", () => {
+describeEval("OAuth Workflows", slackEvals, (it) => {
const mcpAuthResumeThread = {
id: "thread-auth-resume",
channel_id: "C-auth-resume",
thread_ts: "17000000.auth-resume",
};

-slackEval(
-"when MCP auth pauses a turn, resume in the same thread with prior context intact",
-{
+it("when MCP auth pauses a turn, resume in the same thread with prior context intact", async ({
+run,
+}) => {
+await run({
overrides: {
auto_complete_mcp_oauth: ["eval-auth"],
plugin_dirs: ["evals/fixtures/plugins"],
@@ -29,7 +30,6 @@ describe("OAuth Workflows", () => {
),
],
taskTimeout: 120_000,
-timeout: 300_000,
criteria: rubric({
contract:
"After MCP authorization completes, the same thread gets a resumed answer that keeps prior context.",
@@ -49,18 +49,19 @@
"Do not post a generic failure message.",
],
}),
-},
-);
+});
+});

const oauthResumeThread = {
id: "thread-oauth-resume",
channel_id: "C-oauth-resume",
thread_ts: "17000000.oauth-resume",
};

-slackEval(
-"when generic OAuth pauses a turn, resume in the same thread with prior context intact",
-{
+it("when generic OAuth pauses a turn, resume in the same thread with prior context intact", async ({
+run,
+}) => {
+await run({
overrides: {
auto_complete_oauth: ["eval-oauth"],
plugin_dirs: ["evals/fixtures/plugins"],
@@ -79,7 +80,6 @@
),
],
taskTimeout: 120_000,
-timeout: 300_000,
criteria: rubric({
contract:
"After generic OAuth authorization completes, the same thread gets a resumed answer that keeps prior context.",
Expand All @@ -97,18 +97,19 @@ describe("OAuth Workflows", () => {
"Do not post a generic failure message.",
],
}),
-},
-);
+});
+});

const oauthReconnectThread = {
id: "thread-oauth-reconnect",
channel_id: "C-oauth-reconnect",
thread_ts: "17000000.oauth-reconnect",
};

-slackEval(
-"when the user explicitly asks to reconnect, confirm reconnection without auto-resuming another task",
-{
+it("when the user explicitly asks to reconnect, confirm reconnection without auto-resuming another task", async ({
+run,
+}) => {
+await run({
overrides: {
auto_complete_oauth: ["eval-oauth"],
plugin_dirs: ["evals/fixtures/plugins"],
@@ -120,7 +121,6 @@
),
],
taskTimeout: 120_000,
-timeout: 300_000,
criteria: rubric({
contract:
"An explicit reconnect request can drive a fresh authorization cycle to completion in the same thread.",
@@ -137,6 +137,6 @@
"Do not post a generic failure message.",
],
}),
-},
-);
+});
+});
});
50 changes: 28 additions & 22 deletions packages/junior-evals/evals/core/output-contract.eval.ts
@@ -1,10 +1,11 @@
-import { describe } from "vitest";
-import { mention, rubric, slackEval } from "../helpers";
+import { describeEval } from "vitest-evals";
+import { mention, rubric, slackEvals } from "../helpers";

-describe("Output Contract", () => {
-slackEval(
-"when asked for a structured overview, use bolded section labels instead of markdown headings",
-{
+describeEval("Output Contract", slackEvals, (it) => {
+it("when asked for a structured overview, avoid hash markdown headings", async ({
+run,
+}) => {
+await run({
events: [
mention(
"Give me a short overview of how OAuth 2.0 authorization code flow works. Cover the authorization request, token exchange, and refresh. Keep it to a few short sections.",
@@ -13,22 +14,26 @@ describe("Output Contract", () => {
requireSandboxReady: false,
criteria: rubric({
contract:
-"Structured multi-section replies use Slack-friendly bolded section labels, not markdown heading syntax.",
+"Structured multi-section replies do not use hash-prefixed markdown heading markers.",
pass: [
"The assistant posts one reply that covers the authorization request, token exchange, and refresh.",
-"Section labels appear as bolded short phrases on their own line, not as markdown headings.",
+"No section label line starts with `#`, `##`, or `###`.",
],
+allow: [
+"Bolded title lines, bolded section labels, and numbered bold labels are acceptable.",
+],
fail: [
-"Do not use markdown heading syntax (lines beginning with `#`, `##`, or `###`) for section labels.",
-"Do not paste a heading line like `# Authorization Request` at the start of a section.",
+"Do not use lines beginning with `#`, `##`, or `###` for section labels.",
+"Do not paste a hash-heading line like `# Authorization Request` at the start of a section.",
],
}),
-},
-);
+});
+});

-slackEval(
-"when the reply contains multiple URLs, use plain URLs instead of markdown link syntax",
-{
+it("when the reply contains multiple URLs, use plain URLs instead of markdown link syntax", async ({
+run,
+}) => {
+await run({
events: [
mention(
"Where can I find the official documentation for the Slack Web API, Slack Bolt JS, and Slack Block Kit? Just point me at the three canonical starting pages.",
@@ -47,12 +52,13 @@ describe("Output Contract", () => {
"Do not wrap URLs in Slack `<url|label>` link syntax unless the user explicitly asked for that form.",
],
}),
-},
-);
+});
+});

-slackEval(
-"when asked to compare two options, use bullets instead of a markdown table",
-{
+it("when asked to compare two options, use bullets instead of a markdown table", async ({
+run,
+}) => {
+await run({
events: [
mention(
"Give me a short comparison of REST and GraphQL across these three dimensions: caching, over-fetching, and tooling maturity. Keep it tight.",
@@ -71,6 +77,6 @@ describe("Output Contract", () => {
"Do not include a row like `| REST | GraphQL |` or similar pipe-delimited structures.",
],
}),
-},
-);
+});
+});
});