From 34f9e39f0b6f9a0932dc9969bc154eb65f2f282c Mon Sep 17 00:00:00 2001
From: Danh Doan <danhdoancv@gmail.com>
Date: Tue, 21 Apr 2026 20:54:41 +0700
Subject: [PATCH 1/2] feat: [ENG-2243] AutoHarness V2 dual-VM isolation
 integration test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 3 Task 3.5 — closes brutal-review item A3 with 5 integration
tests covering every cross-VM attack vector. Each test exercises the
full pipeline (real `HarnessStore` + real `HarnessModuleBuilder` +
real `SandboxService.loadHarness`) against a harness fixture
designed to breach the sandbox boundary; the test asserts the breach
is contained and the pipeline stays healthy.

1. **Global pollution** — harness `globalThis.X = 'touched'` stays
in the VM's private `globalThis`; Node's `globalThis[X]` remains
`undefined` (V8 per-context realm).

2. **Closure leak** — harness returns a closure that captures
`ctx.tools` and tries `stolen.curate = hijackFn` when called.
Deep-freeze at the invocation boundary (ENG-2239) + strict-mode
injection means the mutation throws a VM-realm `TypeError` with
message matching `/read.?only/i`.

3. **Mutable parameter** — harness writes `ctx.env.commandType =
'hacked'`. Deep-freeze throws; wrapper normalizes as `curate()
failed: ...`.

4. **Prototype pollution** — harness `Object.prototype.X = 'polluted'`
stays in the VM realm's prototype; Node's `({} as any).X` remains
`undefined`.

5. **Stack-trace escape** — harness throws error with attached
`capturedThis: globalThis` AND `capturedArbitrary: {secret: ...}`.
Wrapper constructs a fresh `new Error('curate() failed: ' + msg)`
— only `message` survives, all attached properties drop.

**Cross-realm `instanceof` (Attack 2)**: TypeErrors from
`vm.createContext`
are instances of the VM realm's TypeError class, not Node's outer
realm. `error instanceof TypeError` fails despite the error being
structurally a TypeError. Fix: check `error.name === 'TypeError'`
(string-valued, realm-agnostic) + `error.message` match.

**`capturedArbitrary` property (Attack 5)**: attack attaches two
properties to the thrown error — a known one (`capturedThis`) and an
arbitrary one (`capturedArbitrary`). Proves the wrapper strips
*all* attached properties by constructing a fresh Error, not just
the ones it explicitly knows about.

Per-attack `expectSandboxHealthy()` now asserts BOTH `2 + 2 === 4`
(JS intact) AND `typeof tools === 'object' && typeof tools.readFile
=== 'function'` (tools namespace wired). A single sandbox snapshot
proves "attack didn't destabilize execution OR tool injection."

Uses `FileKeyStorage({inMemory: true})` instead of the task doc's
suggested `mkdtemp` + disk. Attacks don't need real I/O; in-memory
storage means zero disk footprint, zero tmpdir collision surface, zero
cleanup logic.

Full suite: 6710 passing / 0 failing; 59ms for this file's 5 tests.
---
 .../agent/harness/isolation.test.ts           | 341 ++++++++++++++++++
 1 file changed, 341 insertions(+)
 create mode 100644 test/integration/agent/harness/isolation.test.ts

diff --git a/test/integration/agent/harness/isolation.test.ts b/test/integration/agent/harness/isolation.test.ts
new file mode 100644
index 000000000..b08241403
--- /dev/null
+++ b/test/integration/agent/harness/isolation.test.ts
@@ -0,0 +1,341 @@
+/**
+ * AutoHarness V2 — Phase 3 Task 3.5 dual-VM isolation integration test.
+ *
+ * Closes brutal-review item A3: the five enumerated attack vectors
+ * that MUST NOT cross between a harness's `vm.createContext` and the
+ * outer Node process. These tests are Phase 3's security proof of
+ * work — if any scenario regresses, the harness sandbox is
+ * structurally compromised.
+ *
+ * Each attack is exercised end-to-end against the real pipeline:
+ * real `HarnessStore` (in-memory-backed `FileKeyStorage`), real
+ * `HarnessModuleBuilder`, real `SandboxService.loadHarness`. The
+ * test runs `result.module.curate(ctx)` so the wrapper's deep-freeze
+ * + strict-mode + VM timeout apply at every invocation boundary.
+ *
+ * Invariants per attack:
+ *
+ *   1. The attempted leak does NOT reach the outer Node scope (or
+ *      the test's object-literal prototype, as appropriate).
+ *   2. The loader pipeline stays healthy after the attack — a
+ *      subsequent `expectSandboxHealthy()` confirms no state
+ *      corruption.
+ */
+
+import {expect} from 'chai'
+import {createSandbox, type SinonSandbox} from 'sinon'
+
+import type {
+  HarnessContext,
+  HarnessVersion,
+} from '../../../../src/agent/core/domain/harness/types.js'
+import type {IFileSystem} from '../../../../src/agent/core/interfaces/i-file-system.js'
+import type {ValidatedHarnessConfig} from '../../../../src/agent/infra/agent/agent-schemas.js'
+
+import {NoOpLogger} from '../../../../src/agent/core/interfaces/i-logger.js'
+import {HarnessModuleBuilder} from '../../../../src/agent/infra/harness/harness-module-builder.js'
+import {HarnessStore} from '../../../../src/agent/infra/harness/harness-store.js'
+import {SandboxService} from '../../../../src/agent/infra/sandbox/sandbox-service.js'
+import {FileKeyStorage} from '../../../../src/agent/infra/storage/file-key-storage.js'
+
+const VALID_META = `
+  exports.meta = function meta() {
+    return {
+      capabilities: ['curate'],
+      commandType: 'curate',
+      projectPatterns: [],
+      version: 1,
+    }
+  }
+`
+
+// Unique sentinels per attack so cross-test contamination (if any
+// security failure leaks) surfaces in the right test, not a random one.
+const ATTACK_1_GLOBAL_KEY = '__HARNESS_ISOLATION_TEST_1_LEAK__'
+const ATTACK_4_PROTO_KEY = '__harnessIsolationTest4Pollution__'
+
+function makeVersion(code: string): HarnessVersion {
+  return {
+    code,
+    commandType: 'curate',
+    createdAt: 1_700_000_000_000,
+    heuristic: 0.3,
+    id: 'v-1',
+    metadata: {
+      capabilities: ['curate'],
+      commandType: 'curate',
+      projectPatterns: [],
+      version: 1,
+    },
+    projectId: 'p',
+    projectType: 'typescript',
+    version: 1,
+  }
+}
+
+function makeCtx(): HarnessContext {
+  return {
+    abort: new AbortController().signal,
+    env: {commandType: 'curate', projectType: 'typescript', workingDirectory: '/'},
+    tools: {
+      // Stubs — attacks 2 and 3 reference `ctx.tools` as a capture
+      // target, but no attack actually calls a real tool. A future
+      // attack that exercises a live tool call should replace these
+      // with session-bound fixtures.
+      curate: (async () => ({})) as unknown as HarnessContext['tools']['curate'],
+      readFile: (async () => ({})) as unknown as HarnessContext['tools']['readFile'],
+    },
+  }
+}
+
+function makeEnabledConfig(): ValidatedHarnessConfig {
+  return {autoLearn: true, enabled: true, language: 'typescript', maxVersions: 20}
+}
+
+function makeStubFileSystem(sb: SinonSandbox): IFileSystem {
+  // Matches the established sandbox-test stub pattern. None of the
+  // isolation attacks invoke file-system methods through the sandbox.
+  return {
+    editFile: sb.stub().resolves({bytesWritten: 0, path: '/'}),
+    globFiles: sb.stub().resolves({files: [], totalFound: 0, truncated: false}),
+    initialize: sb.stub(),
+    listDirectory: sb.stub().resolves({files: [], tree: '', truncated: false}),
+    readFile: sb.stub().resolves({content: '', exists: true, path: '/'}),
+    searchContent: sb.stub().resolves({matches: [], totalMatches: 0, truncated: false}),
+    writeFile: sb.stub().resolves({bytesWritten: 0, path: '/'}),
+  } as unknown as IFileSystem
+}
+
+describe('dual-VM isolation — brutal-review A3', () => {
+  let sb: SinonSandbox
+  let service: SandboxService
+  let store: HarnessStore
+
+  beforeEach(async () => {
+    sb = createSandbox()
+    const keyStorage = new FileKeyStorage({inMemory: true})
+    await keyStorage.initialize()
+    store = new HarnessStore(keyStorage, new NoOpLogger())
+    const builder = new HarnessModuleBuilder(new NoOpLogger())
+
+    service = new SandboxService()
+    service.setHarnessConfig(makeEnabledConfig())
+    service.setHarnessStore(store)
+    service.setHarnessModuleBuilder(builder)
+    service.setFileSystem(makeStubFileSystem(sb))
+  })
+
+  afterEach(() => {
+    sb.restore()
+    // Defensive cleanup — if a security failure somehow let the attack
+    // succeed, we don't want the pollution bleeding into other tests
+    // (or the broader test suite).
+    delete (globalThis as Record<string, unknown>)[ATTACK_1_GLOBAL_KEY]
+    delete (Object.prototype as Record<string, unknown>)[ATTACK_4_PROTO_KEY]
+  })
+
+  async function seedAndLoad(code: string) {
+    await store.saveVersion(makeVersion(code))
+    return service.loadHarness('s1', 'p', 'curate')
+  }
+
+  async function expectSandboxHealthy(): Promise<void> {
+    // Prove two things in one check: JS evaluation is intact AND the
+    // tools namespace is still wired. The AC for A3 says "subsequent
+    // raw tools.* calls still work" — this snapshot captures both
+    // properties without needing async-Promise gymnastics inside
+    // the sandbox.
+    const result = await service.executeCode(
+      `({math: 2 + 2, hasTools: typeof tools === 'object' && typeof tools.readFile === 'function'})`,
+      's1',
+    )
+    expect(result.returnValue).to.deep.equal(
+      {hasTools: true, math: 4},
+      'sandbox must stay healthy after attack: JS + tools namespace intact',
+    )
+  }
+
+  // ── Attack 1: Global pollution ────────────────────────────────────────────
+
+  it('1. Global pollution: harness `globalThis.X = ...` does NOT reach Node globalThis', async () => {
+    const code = `${VALID_META}
+      exports.curate = async function curate(ctx) {
+        globalThis.${ATTACK_1_GLOBAL_KEY} = 'touched'
+        return 'ok'
+      }
+    `
+    const result = await seedAndLoad(code)
+    expect(result.loaded).to.equal(true)
+    if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only
+    if (result.module.curate === undefined) throw new Error('fixture must export curate')
+
+    await result.module.curate(makeCtx())
+
+    // The harness set `globalThis.<KEY>` inside its own VM context.
+    // V8's `vm.createContext({})` gives each context its own
+    // globalThis, so Node's globalThis must remain unmodified.
+    expect((globalThis as Record<string, unknown>)[ATTACK_1_GLOBAL_KEY]).to.equal(
+      undefined,
+      'harness global-pollution must not reach Node globalThis',
+    )
+
+    await expectSandboxHealthy()
+  })
+
+  // ── Attack 2: Closure leak ────────────────────────────────────────────────
+
+  it('2. Closure leak: returned closure cannot mutate captured ctx.tools', async () => {
+    // The attack: harness returns a closure that closes over
+    // `ctx.tools` and attempts to rebind `stolen.curate = hijackFn`.
+    // Deep-freeze at the invocation boundary (HarnessModuleBuilder
+    // wrapper) means `stolen` is a frozen object — the mutation
+    // throws in strict mode.
+    const code = `${VALID_META}
+      exports.curate = async function curate(ctx) {
+        const stolen = ctx.tools
+        return function hijackAttempt() {
+          stolen.curate = function() { return 'hijacked' }
+          return 'should-not-reach-here'
+        }
+      }
+    `
+    const result = await seedAndLoad(code)
+    expect(result.loaded).to.equal(true)
+    if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only
+    if (result.module.curate === undefined) throw new Error('fixture must export curate')
+
+    const returned = await result.module.curate(makeCtx())
+    expect(typeof returned).to.equal('function', 'harness should return the closure')
+
+    // Invoke the closure from outer test scope. Because `stolen`
+    // references the frozen `ctx.tools`, the `stolen.curate = ...`
+    // assignment throws TypeError synchronously. The return type of
+    // `module.curate` is declared as `CurateResult` but the harness
+    // intentionally returns a function here — double cast required
+    // to cross the declared/actual type boundary.
+    //
+    // Assertion note: the thrown `TypeError` comes from the VM's
+    // realm, not Node's outer realm, so `instanceof TypeError`
+    // against Node's class fails (distinct prototypes). Checking
+    // `error.name` is realm-agnostic and captures the exact intent:
+    // "a TypeError is thrown due to strict-mode frozen-property write."
+    try {
+      ;(returned as unknown as () => void)()
+      expect.fail('expected closure to throw on frozen mutation')
+    } catch (error) {
+      expect(error).to.be.an('error')
+      const err = error as Error
+      expect(err.name).to.equal('TypeError')
+      expect(err.message).to.match(/read.?only/i)
+    }
+
+    await expectSandboxHealthy()
+  })
+
+  // ── Attack 3: Mutable parameter / frozen context ─────────────────────────
+
+  it('3. Mutable parameter: harness cannot mutate its own ctx.env', async () => {
+    // In our architecture, the user doesn't pass their own object
+    // into `harness.curate(state)` — the module receives a
+    // deep-frozen HarnessContext built per-invocation. This test
+    // verifies mutation attempts on the received ctx fail, which is
+    // the architectural analog of "mutable parameter can't be
+    // weaponized."
+    const code = `${VALID_META}
+      exports.curate = async function curate(ctx) {
+        ctx.env.commandType = 'hacked'
+        return 'ok'
+      }
+    `
+    const result = await seedAndLoad(code)
+    expect(result.loaded).to.equal(true)
+    if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only
+    if (result.module.curate === undefined) throw new Error('fixture must export curate')
+
+    // What actually reaches this catch is a plain `Error` from the
+    // wrapper (`new Error('harness curate() failed: ' + msg)`), NOT a
+    // TypeError — the underlying TypeError is caught inside
+    // `wrapInvocation` and normalized before the async boundary. The
+    // underlying message is preserved in the wrapper's string, so
+    // asserting on `/read.?only/i` proves the root cause was a
+    // frozen-property write specifically — not just any harness
+    // failure that happened to bubble through.
+    try {
+      await result.module.curate(makeCtx())
+      expect.fail('expected wrapped Error — ctx.env is frozen (TypeError inside VM)')
+    } catch (error) {
+      expect((error as Error).message).to.match(/curate\(\) failed.*read.?only/i)
+    }
+
+    await expectSandboxHealthy()
+  })
+
+  // ── Attack 4: Prototype pollution ────────────────────────────────────────
+
+  it('4. Prototype pollution: harness `Object.prototype.X = ...` does NOT reach outer Object.prototype', async () => {
+    const code = `${VALID_META}
+      exports.curate = async function curate(ctx) {
+        Object.prototype.${ATTACK_4_PROTO_KEY} = 'polluted'
+        return 'ok'
+      }
+    `
+    const result = await seedAndLoad(code)
+    expect(result.loaded).to.equal(true)
+    if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only
+    if (result.module.curate === undefined) throw new Error('fixture must export curate')
+
+    await result.module.curate(makeCtx())
+
+    // V8 gives each `vm.createContext` its own realm with an
+    // independent Object.prototype. Mutations to the harness-side
+    // Object.prototype must not cross into Node's.
+    expect(({} as Record<string, unknown>)[ATTACK_4_PROTO_KEY]).to.equal(
+      undefined,
+      'harness prototype-pollution must not reach outer Object.prototype',
+    )
+
+    await expectSandboxHealthy()
+  })
+
+  // ── Attack 5: Stack-trace escape ─────────────────────────────────────────
+
+  it('5. Stack-trace escape: attached properties on thrown errors do NOT escape the wrapper', async () => {
+    // The attack: harness throws an Error with a custom property
+    // (e.g. `capturedThis = globalThis`) attached. Without
+    // normalization, that property would reach outer `catch` blocks
+    // and give user-facing code a handle to the VM's globalThis.
+    // The wrapper's `throw new Error('harness curate() failed: ' + msg)`
+    // constructs a fresh Error — only the message survives.
+    const code = `${VALID_META}
+      exports.curate = async function curate(ctx) {
+        const err = new Error('boom')
+        err.capturedThis = globalThis
+        err.capturedArbitrary = { secret: 'do-not-leak' }
+        throw err
+      }
+    `
+    const result = await seedAndLoad(code)
+    expect(result.loaded).to.equal(true)
+    if (!result.loaded) return // unreachable: Chai assertion above throws first — TypeScript narrowing only
+    if (result.module.curate === undefined) throw new Error('fixture must export curate')
+
+    try {
+      await result.module.curate(makeCtx())
+      expect.fail('expected throw')
+    } catch (error) {
+      expect(error).to.be.instanceOf(Error)
+      const caught = error as Error & {capturedArbitrary?: unknown; capturedThis?: unknown}
+      expect(caught.message).to.match(/curate\(\) failed/)
+      expect(caught.capturedThis).to.equal(
+        undefined,
+        'wrapper must not propagate captured globalThis reference',
+      )
+      expect(caught.capturedArbitrary).to.equal(
+        undefined,
+        'wrapper must not propagate arbitrary captured properties',
+      )
+    }
+
+    await expectSandboxHealthy()
+  })
+})

From 46ea238263ecec307f90a9c43986486b84d59f8a Mon Sep 17 00:00:00 2001
From: Danh Doan <danhdoancv@gmail.com>
Date: Tue, 21 Apr 2026 21:20:48 +0700
Subject: [PATCH 2/2] fix: stabilize FileQueryLogStore pruning test helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `waitForPruneToSettle` helper returned on the first two consecutive
identical counts with 2ms polling, which under slow CI could catch the
pre-prune state twice and declare "settled" before any work started.

Require 5 consecutive stable readings at 5ms intervals (25ms of no
change) with a 1s max bound — enough to ride out the initial async gap
without inflating test runtime.
---
 .../storage/file-query-log-store.test.ts      | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/integration/infra/storage/file-query-log-store.test.ts b/test/integration/infra/storage/file-query-log-store.test.ts
index d9d509d06..a771cb3f4 100644
--- a/test/integration/infra/storage/file-query-log-store.test.ts
+++ b/test/integration/infra/storage/file-query-log-store.test.ts
@@ -47,7 +47,12 @@ async function generateIds(s: FileQueryLogStore, count: number): Promise<string[
   return ids
 }
 
-/** Poll until the .json file count in dir stabilises (two consecutive reads match). */
+/**
+ * Poll until the .json file count in dir stabilises. Requires 5 consecutive
+ * stable readings (25ms of no change) to avoid a race under slow CI where the
+ * async prune hasn't started yet — two consecutive identical pre-prune counts
+ * would otherwise declare "settled" before any work happened.
+ */
 async function waitForPruneToSettle(dir: string): Promise<void> {
   const count = async (): Promise<number> => {
     try {
@@ -58,15 +63,22 @@ async function waitForPruneToSettle(dir: string): Promise<void> {
     }
   }
 
+  let stable = 0
   let prev = await count()
-  for (let i = 0; i < 50; i++) {
+  for (let i = 0; i < 200; i++) {
     // eslint-disable-next-line no-await-in-loop
     await new Promise((r) => {
-      setTimeout(r, 2)
+      setTimeout(r, 5)
     })
     // eslint-disable-next-line no-await-in-loop
     const cur = await count()
-    if (cur === prev) return
+    if (cur === prev) {
+      stable++
+      if (stable >= 5) return
+    } else {
+      stable = 0
+    }
+
     prev = cur
   }
 }