From 34f9e39f0b6f9a0932dc9969bc154eb65f2f282c Mon Sep 17 00:00:00 2001 From: Danh Doan Date: Tue, 21 Apr 2026 20:54:41 +0700 Subject: [PATCH 1/2] feat: [ENG-2243] AutoHarness V2 dual-VM isolation integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 Task 3.5 — closes brutal-review item A3 with 5 integration tests covering every cross-VM attack vector. Each test exercises the full pipeline (real `HarnessStore` + real `HarnessModuleBuilder` + real `SandboxService.loadHarness`) against a harness fixture designed to breach the sandbox boundary; the test asserts the breach is contained and the pipeline stays healthy. 1. **Global pollution** — harness `globalThis.X = 'touched'` stays in the VM's private `globalThis`; Node's `globalThis[X]` remains `undefined` (V8 per-context realm). 2. **Closure leak** — harness returns a closure that captures `ctx.tools` and tries `stolen.curate = hijackFn` when called. Deep-freeze at the invocation boundary (ENG-2239) + strict-mode injection means the mutation throws a VM-realm `TypeError` with message matching `/read.?only/i`. 3. **Mutable parameter** — harness writes `ctx.env.commandType = 'hacked'`. Deep-freeze throws; wrapper normalizes as `curate() failed: ...`. 4. **Prototype pollution** — harness `Object.prototype.X = 'polluted'` stays in the VM realm's prototype; Node's `({} as any).X` remains `undefined`. 5. **Stack-trace escape** — harness throws error with attached `capturedThis: globalThis` AND `capturedArbitrary: {secret: ...}`. Wrapper constructs a fresh `new Error('curate() failed: ' + msg)` — only `message` survives, all attached properties drop. **Cross-realm `instanceof` (Attack 2)**: TypeErrors from `vm.createContext` are instances of the VM realm's TypeError class, not Node's outer realm. `error instanceof TypeError` fails despite the error being structurally a TypeError. Fix: check `error.name === 'TypeError'` (string-valued, realm-agnostic) + `error.message` match. **`capturedArbitrary` property (Attack 5)**: attack attaches two properties to the thrown error — a known one (`capturedThis`) and an arbitrary one (`capturedArbitrary`). Proves the wrapper strips *all* attached properties by constructing a fresh Error, not just the ones it explicitly knows about. Per-attack `expectSandboxHealthy()` now asserts BOTH `2 + 2 === 4` (JS intact) AND `typeof tools === 'object' && typeof tools.readFile === 'function'` (tools namespace wired). A single sandbox snapshot proves "attack didn't destabilize execution OR tool injection." Uses `FileKeyStorage({inMemory: true})` instead of the task doc's suggested `mkdtemp` + disk. Attacks don't need real I/O; in-memory storage means zero disk footprint, zero tmpdir collision surface, zero cleanup logic. Full suite: 6710 passing / 0 failing; 59ms for this file's 5 tests. --- .../agent/harness/isolation.test.ts | 341 ++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 test/integration/agent/harness/isolation.test.ts diff --git a/test/integration/agent/harness/isolation.test.ts b/test/integration/agent/harness/isolation.test.ts new file mode 100644 index 000000000..b08241403 --- /dev/null +++ b/test/integration/agent/harness/isolation.test.ts @@ -0,0 +1,341 @@ +/** + * AutoHarness V2 — Phase 3 Task 3.5 dual-VM isolation integration test. + * + * Closes brutal-review item A3: the five enumerated attack vectors + * that MUST NOT cross between a harness's `vm.createContext` and the + * outer Node process. These tests are Phase 3's security proof of + * work — if any scenario regresses, the harness sandbox is + * structurally compromised. + * + * Each attack is exercised end-to-end against the real pipeline: + * real `HarnessStore` (in-memory-backed `FileKeyStorage`), real + * `HarnessModuleBuilder`, real `SandboxService.loadHarness`. The + * test runs `result.module.curate(ctx)` so the wrapper's deep-freeze + * + strict-mode + VM timeout apply at every invocation boundary. + * + * Invariants per attack: + * + * 1. The attempted leak does NOT reach the outer Node scope (or + * the test's object-literal prototype, as appropriate). + * 2. The loader pipeline stays healthy after the attack — a + * subsequent `expectSandboxHealthy()` confirms no state + * corruption. + */ + +import {expect} from 'chai' +import {createSandbox, type SinonSandbox} from 'sinon' + +import type { + HarnessContext, + HarnessVersion, +} from '../../../../src/agent/core/domain/harness/types.js' +import type {IFileSystem} from '../../../../src/agent/core/interfaces/i-file-system.js' +import type {ValidatedHarnessConfig} from '../../../../src/agent/infra/agent/agent-schemas.js' + +import {NoOpLogger} from '../../../../src/agent/core/interfaces/i-logger.js' +import {HarnessModuleBuilder} from '../../../../src/agent/infra/harness/harness-module-builder.js' +import {HarnessStore} from '../../../../src/agent/infra/harness/harness-store.js' +import {SandboxService} from '../../../../src/agent/infra/sandbox/sandbox-service.js' +import {FileKeyStorage} from '../../../../src/agent/infra/storage/file-key-storage.js' + +const VALID_META = ` + exports.meta = function meta() { + return { + capabilities: ['curate'], + commandType: 'curate', + projectPatterns: [], + version: 1, + } + } +` + +// Unique sentinels per attack so cross-test contamination (if any +// security failure leaks) surfaces in the right test, not a random one. +const ATTACK_1_GLOBAL_KEY = '__HARNESS_ISOLATION_TEST_1_LEAK__' +const ATTACK_4_PROTO_KEY = '__harnessIsolationTest4Pollution__' + +function makeVersion(code: string): HarnessVersion { + return { + code, + commandType: 'curate', + createdAt: 1_700_000_000_000, + heuristic: 0.3, + id: 'v-1', + metadata: { + capabilities: ['curate'], + commandType: 'curate', + projectPatterns: [], + version: 1, + }, + projectId: 'p', + projectType: 'typescript', + version: 1, + } +} + +function makeCtx(): HarnessContext { + return { + abort: new AbortController().signal, + env: {commandType: 'curate', projectType: 'typescript', workingDirectory: '/'}, + tools: { + // Stubs — attacks 2 and 3 reference `ctx.tools` as a capture + // target, but no attack actually calls a real tool. A future + // attack that exercises a live tool call should replace these + // with session-bound fixtures. + curate: (async () => ({})) as unknown as HarnessContext['tools']['curate'], + readFile: (async () => ({})) as unknown as HarnessContext['tools']['readFile'], + }, + } +} + +function makeEnabledConfig(): ValidatedHarnessConfig { + return {autoLearn: true, enabled: true, language: 'typescript', maxVersions: 20} +} + +function makeStubFileSystem(sb: SinonSandbox): IFileSystem { + // Matches the established sandbox-test stub pattern. None of the + // isolation attacks invoke file-system methods through the sandbox. + return { + editFile: sb.stub().resolves({bytesWritten: 0, path: '/'}), + globFiles: sb.stub().resolves({files: [], totalFound: 0, truncated: false}), + initialize: sb.stub(), + listDirectory: sb.stub().resolves({files: [], tree: '', truncated: false}), + readFile: sb.stub().resolves({content: '', exists: true, path: '/'}), + searchContent: sb.stub().resolves({matches: [], totalMatches: 0, truncated: false}), + writeFile: sb.stub().resolves({bytesWritten: 0, path: '/'}), + } as unknown as IFileSystem +} + +describe('dual-VM isolation — brutal-review A3', () => { + let sb: SinonSandbox + let service: SandboxService + let store: HarnessStore + + beforeEach(async () => { + sb = createSandbox() + const keyStorage = new FileKeyStorage({inMemory: true}) + await keyStorage.initialize() + store = new HarnessStore(keyStorage, new NoOpLogger()) + const builder = new HarnessModuleBuilder(new NoOpLogger()) + + service = new SandboxService() + service.setHarnessConfig(makeEnabledConfig()) + service.setHarnessStore(store) + service.setHarnessModuleBuilder(builder) + service.setFileSystem(makeStubFileSystem(sb)) + }) + + afterEach(() => { + sb.restore() + // Defensive cleanup — if a security failure somehow let the attack + // succeed, we don't want the pollution bleeding into other tests + // (or the broader test suite). + delete (globalThis as Record)[ATTACK_1_GLOBAL_KEY] + delete (Object.prototype as Record)[ATTACK_4_PROTO_KEY] + }) + + async function seedAndLoad(code: string) { + await store.saveVersion(makeVersion(code)) + return service.loadHarness('s1', 'p', 'curate') + } + + async function expectSandboxHealthy(): Promise { + // Prove two things in one check: JS evaluation is intact AND the + // tools namespace is still wired. The AC for A3 says "subsequent + // raw tools.* calls still work" — this snapshot captures both + // properties without needing async-Promise gymnastics inside + // the sandbox. + const result = await service.executeCode( + `({math: 2 + 2, hasTools: typeof tools === 'object' && typeof tools.readFile === 'function'})`, + 's1', + ) + expect(result.returnValue).to.deep.equal( + {hasTools: true, math: 4}, + 'sandbox must stay healthy after attack: JS + tools namespace intact', + ) + } + + // ── Attack 1: Global pollution ──────────────────────────────────────────── + + it('1. Global pollution: harness `globalThis.X = ...` does NOT reach Node globalThis', async () => { + const code = `${VALID_META} + exports.curate = async function curate(ctx) { + globalThis.${ATTACK_1_GLOBAL_KEY} = 'touched' + return 'ok' + } + ` + const result = await seedAndLoad(code) + expect(result.loaded).to.equal(true) + if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only + if (result.module.curate === undefined) throw new Error('fixture must export curate') + + await result.module.curate(makeCtx()) + + // The harness set `globalThis.` inside its own VM context. + // V8's `vm.createContext({})` gives each context its own + // globalThis, so Node's globalThis must remain unmodified. + expect((globalThis as Record)[ATTACK_1_GLOBAL_KEY]).to.equal( + undefined, + 'harness global-pollution must not reach Node globalThis', + ) + + await expectSandboxHealthy() + }) + + // ── Attack 2: Closure leak ──────────────────────────────────────────────── + + it('2. Closure leak: returned closure cannot mutate captured ctx.tools', async () => { + // The attack: harness returns a closure that closes over + // `ctx.tools` and attempts to rebind `stolen.curate = hijackFn`. + // Deep-freeze at the invocation boundary (HarnessModuleBuilder + // wrapper) means `stolen` is a frozen object — the mutation + // throws in strict mode. + const code = `${VALID_META} + exports.curate = async function curate(ctx) { + const stolen = ctx.tools + return function hijackAttempt() { + stolen.curate = function() { return 'hijacked' } + return 'should-not-reach-here' + } + } + ` + const result = await seedAndLoad(code) + expect(result.loaded).to.equal(true) + if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only + if (result.module.curate === undefined) throw new Error('fixture must export curate') + + const returned = await result.module.curate(makeCtx()) + expect(typeof returned).to.equal('function', 'harness should return the closure') + + // Invoke the closure from outer test scope. Because `stolen` + // references the frozen `ctx.tools`, the `stolen.curate = ...` + // assignment throws TypeError synchronously. The return type of + // `module.curate` is declared as `CurateResult` but the harness + // intentionally returns a function here — double cast required + // to cross the declared/actual type boundary. + // + // Assertion note: the thrown `TypeError` comes from the VM's + // realm, not Node's outer realm, so `instanceof TypeError` + // against Node's class fails (distinct prototypes). Checking + // `error.name` is realm-agnostic and captures the exact intent: + // "a TypeError is thrown due to strict-mode frozen-property write." + try { + ;(returned as unknown as () => void)() + expect.fail('expected closure to throw on frozen mutation') + } catch (error) { + expect(error).to.be.an('error') + const err = error as Error + expect(err.name).to.equal('TypeError') + expect(err.message).to.match(/read.?only/i) + } + + await expectSandboxHealthy() + }) + + // ── Attack 3: Mutable parameter / frozen context ───────────────────────── + + it('3. Mutable parameter: harness cannot mutate its own ctx.env', async () => { + // In our architecture, the user doesn't pass their own object + // into `harness.curate(state)` — the module receives a + // deep-frozen HarnessContext built per-invocation. This test + // verifies mutation attempts on the received ctx fail, which is + // the architectural analog of "mutable parameter can't be + // weaponized." + const code = `${VALID_META} + exports.curate = async function curate(ctx) { + ctx.env.commandType = 'hacked' + return 'ok' + } + ` + const result = await seedAndLoad(code) + expect(result.loaded).to.equal(true) + if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only + if (result.module.curate === undefined) throw new Error('fixture must export curate') + + // What actually reaches this catch is a plain `Error` from the + // wrapper (`new Error('harness curate() failed: ' + msg)`), NOT a + // TypeError — the underlying TypeError is caught inside + // `wrapInvocation` and normalized before the async boundary. The + // underlying message is preserved in the wrapper's string, so + // asserting on `/read.?only/i` proves the root cause was a + // frozen-property write specifically — not just any harness + // failure that happened to bubble through. + try { + await result.module.curate(makeCtx()) + expect.fail('expected wrapped Error — ctx.env is frozen (TypeError inside VM)') + } catch (error) { + expect((error as Error).message).to.match(/curate\(\) failed.*read.?only/i) + } + + await expectSandboxHealthy() + }) + + // ── Attack 4: Prototype pollution ──────────────────────────────────────── + + it('4. Prototype pollution: harness `Object.prototype.X = ...` does NOT reach outer Object.prototype', async () => { + const code = `${VALID_META} + exports.curate = async function curate(ctx) { + Object.prototype.${ATTACK_4_PROTO_KEY} = 'polluted' + return 'ok' + } + ` + const result = await seedAndLoad(code) + expect(result.loaded).to.equal(true) + if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only + if (result.module.curate === undefined) throw new Error('fixture must export curate') + + await result.module.curate(makeCtx()) + + // V8 gives each `vm.createContext` its own realm with an + // independent Object.prototype. Mutations to the harness-side + // Object.prototype must not cross into Node's. + expect(({} as Record)[ATTACK_4_PROTO_KEY]).to.equal( + undefined, + 'harness prototype-pollution must not reach outer Object.prototype', + ) + + await expectSandboxHealthy() + }) + + // ── Attack 5: Stack-trace escape ───────────────────────────────────────── + + it('5. Stack-trace escape: attached properties on thrown errors do NOT escape the wrapper', async () => { + // The attack: harness throws an Error with a custom property + // (e.g. `capturedThis = globalThis`) attached. Without + // normalization, that property would reach outer `catch` blocks + // and give user-facing code a handle to the VM's globalThis. + // The wrapper's `throw new Error('harness curate() failed: ' + msg)` + // constructs a fresh Error — only the message survives. + const code = `${VALID_META} + exports.curate = async function curate(ctx) { + const err = new Error('boom') + err.capturedThis = globalThis + err.capturedArbitrary = { secret: 'do-not-leak' } + throw err + } + ` + const result = await seedAndLoad(code) + expect(result.loaded).to.equal(true) + if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only + if (result.module.curate === undefined) throw new Error('fixture must export curate') + + try { + await result.module.curate(makeCtx()) + expect.fail('expected throw') + } catch (error) { + expect(error).to.be.instanceOf(Error) + const caught = error as Error & {capturedArbitrary?: unknown; capturedThis?: unknown} + expect(caught.message).to.match(/curate\(\) failed/) + expect(caught.capturedThis).to.equal( + undefined, + 'wrapper must not propagate captured globalThis reference', + ) + expect(caught.capturedArbitrary).to.equal( + undefined, + 'wrapper must not propagate arbitrary captured properties', + ) + } + + await expectSandboxHealthy() + }) +}) From 46ea238263ecec307f90a9c43986486b84d59f8a Mon Sep 17 00:00:00 2001 From: Danh Doan Date: Tue, 21 Apr 2026 21:20:48 +0700 Subject: [PATCH 2/2] fix: stabilize FileQueryLogStore pruning test helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `waitForPruneToSettle` helper returned on the first two consecutive identical counts with 2ms polling, which under slow CI could catch the pre-prune state twice and declare "settled" before any work started. Require 5 consecutive stable readings at 5ms intervals (25ms of no change) with a 1s max bound — enough to ride out the initial async gap without inflating test runtime. --- .../storage/file-query-log-store.test.ts | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/test/integration/infra/storage/file-query-log-store.test.ts b/test/integration/infra/storage/file-query-log-store.test.ts index d9d509d06..a771cb3f4 100644 --- a/test/integration/infra/storage/file-query-log-store.test.ts +++ b/test/integration/infra/storage/file-query-log-store.test.ts @@ -47,7 +47,12 @@ async function generateIds(s: FileQueryLogStore, count: number): Promise { const count = async (): Promise => { try { @@ -58,15 +63,22 @@ async function waitForPruneToSettle(dir: string): Promise { } } + let stable = 0 let prev = await count() - for (let i = 0; i < 50; i++) { + for (let i = 0; i < 200; i++) { // eslint-disable-next-line no-await-in-loop await new Promise((r) => { - setTimeout(r, 2) + setTimeout(r, 5) }) // eslint-disable-next-line no-await-in-loop const cur = await count() - if (cur === prev) return + if (cur === prev) { + stable++ + if (stable >= 5) return + } else { + stable = 0 + } + prev = cur } }