diff --git a/.github/workflows/player-perf.yml b/.github/workflows/player-perf.yml index 937a60113..cee22abec 100644 --- a/.github/workflows/player-perf.yml +++ b/.github/workflows/player-perf.yml @@ -42,6 +42,15 @@ jobs: - shard: load scenarios: load runs: "5" + - shard: fps + scenarios: fps + runs: "3" + - shard: scrub + scenarios: scrub + runs: "3" + - shard: drift + scenarios: drift + runs: "3" steps: - uses: actions/checkout@v4 diff --git a/packages/player/tests/perf/baseline.json b/packages/player/tests/perf/baseline.json index 52211e710..dbe0e7466 100644 --- a/packages/player/tests/perf/baseline.json +++ b/packages/player/tests/perf/baseline.json @@ -1,7 +1,7 @@ { "compLoadColdP95Ms": 2000, "compLoadWarmP95Ms": 1000, - "fpsMin": 55, + "compositionTimeAdvancementRatioMin": 0.95, "scrubLatencyP95IsolatedMs": 80, "scrubLatencyP95InlineMs": 33, "driftMaxMs": 500, diff --git a/packages/player/tests/perf/fixtures/10-video-grid/index.html b/packages/player/tests/perf/fixtures/10-video-grid/index.html new file mode 100644 index 000000000..b4ec61add --- /dev/null +++ b/packages/player/tests/perf/fixtures/10-video-grid/index.html @@ -0,0 +1,126 @@ + + + + + perf fixture: 10-video-grid + + + + + +
+ + + diff --git a/packages/player/tests/perf/fixtures/10-video-grid/sample.mp4 b/packages/player/tests/perf/fixtures/10-video-grid/sample.mp4 new file mode 100644 index 000000000..1cf4df9e1 Binary files /dev/null and b/packages/player/tests/perf/fixtures/10-video-grid/sample.mp4 differ diff --git a/packages/player/tests/perf/index.ts b/packages/player/tests/perf/index.ts index c6942ae9d..e95f2b901 100644 --- a/packages/player/tests/perf/index.ts +++ b/packages/player/tests/perf/index.ts @@ -29,7 +29,10 @@ import { execFileSync } from "node:child_process"; import { existsSync, mkdirSync, writeFileSync } from "node:fs"; import { dirname, resolve } from "node:path"; import { fileURLToPath } from "node:url"; +import { runFps } from "./scenarios/02-fps.ts"; import { runLoad } from "./scenarios/03-load.ts"; +import { runScrub } from "./scenarios/04-scrub.ts"; +import { runDrift } from "./scenarios/05-drift.ts"; import { reportAndGate, type GateMode, type GateResult, type Metric } from "./perf-gate.ts"; import { launchBrowser } from "./runner.ts"; import { startServer } from "./server.ts"; @@ -38,7 +41,42 @@ const HERE = dirname(fileURLToPath(import.meta.url)); const RESULTS_DIR = resolve(HERE, "results"); const RESULTS_FILE = resolve(RESULTS_DIR, "metrics.json"); -type ScenarioId = "load"; +type ScenarioId = "load" | "fps" | "scrub" | "drift"; + +/** + * Per-scenario default `runs` value when the caller didn't pass `--runs`. + * + * Why `load` gets 5 runs and the others get 3: + * + * - `load` reports a single p95 over `runs` measurements, so each `run` is + * one sample. p95 over n=3 is mostly noise (the 95th percentile of three + * numbers is just `max`), so we bump it to 5. We considered 10 — but cold + * load is the slowest scenario in the shard (~2s × 5 runs × 2 fixtures = + * ~20s with disk cache cleared), and going to 10 would push the load shard + * past 30s of pure-measurement wall time per CI invocation. + * - `fps` aggregates as `min(ratio)` over runs — 3 runs gives us a worst- + * of-three signal, which is what we want for a floor metric. Adding more + * runs would only make the ratio strictly smaller (more chances to catch + * a stall) and shift the threshold toward false positives from runner + * contention rather than real regressions. + * - `scrub` and `drift` *pool* their per-run samples (10 seeks/run for + * scrub, ~1500 RVFC frames/run for drift) and compute the percentile over + * the pooled set. Their effective sample count for the percentile is + * `runs × samples_per_run`, not `runs`, so 3 runs already gives 30+ scrub + * samples and 4500+ drift samples per shard — well above the n≈30 rule of + * thumb for a stable p95. + * + * TODO(player-perf): revisit `fps: 3` once we have ~2 weeks of CI baseline + * data — if `min(ratio)` shows >5% inter-run variance attributable to runner + * jitter (not real player regressions), bump to 5 and tighten the + * `compositionTimeAdvancementRatioMin` baseline accordingly. + */ +const DEFAULT_RUNS: Record = { + load: 5, + fps: 3, + scrub: 3, + drift: 3, +}; type ResultsFile = { schemaVersion: 1; @@ -88,7 +126,7 @@ function parseArgs(argv: string[]): ParsedArgs { // `mode` is consumed (measure logs regressions but never fails; enforce // exits non-zero on regression). mode: (process.env.PLAYER_PERF_MODE as GateMode) === "enforce" ? "enforce" : "measure", - scenarios: ["load"], + scenarios: ["load", "fps", "scrub", "drift"], runs: null, fixture: null, headful: false, @@ -150,7 +188,31 @@ async function main(): Promise { const m = await runLoad({ browser, origin: server.origin, - runs: args.runs ?? 5, + runs: args.runs ?? DEFAULT_RUNS.load, + fixture: args.fixture, + }); + metrics.push(...m); + } else if (scenario === "fps") { + const m = await runFps({ + browser, + origin: server.origin, + runs: args.runs ?? DEFAULT_RUNS.fps, + fixture: args.fixture, + }); + metrics.push(...m); + } else if (scenario === "scrub") { + const m = await runScrub({ + browser, + origin: server.origin, + runs: args.runs ?? DEFAULT_RUNS.scrub, + fixture: args.fixture, + }); + metrics.push(...m); + } else if (scenario === "drift") { + const m = await runDrift({ + browser, + origin: server.origin, + runs: args.runs ?? DEFAULT_RUNS.drift, fixture: args.fixture, }); metrics.push(...m); diff --git a/packages/player/tests/perf/perf-gate.ts b/packages/player/tests/perf/perf-gate.ts index 60cf7f52e..86cd75ddf 100644 --- a/packages/player/tests/perf/perf-gate.ts +++ b/packages/player/tests/perf/perf-gate.ts @@ -31,7 +31,16 @@ export type Metric = { export type PerfBaseline = { compLoadColdP95Ms: number; compLoadWarmP95Ms: number; - fpsMin: number; + /** + * Floor on `(compositionTime advanced) / (wallClock elapsed)` over a sustained + * playback window — see packages/player/tests/perf/scenarios/02-fps.ts. A + * healthy player keeps up with its intended speed and reads ~1.0; values + * below 1.0 mean the composition clock fell behind real time, which is the + * actual user-visible jank we want to gate against. Refresh-rate independent + * by construction, so it does not saturate to display refresh on high-Hz + * runners the way the previous `fpsMin` did. Direction: higher-is-better. + */ + compositionTimeAdvancementRatioMin: number; scrubLatencyP95IsolatedMs: number; scrubLatencyP95InlineMs: number; driftMaxMs: number; diff --git a/packages/player/tests/perf/scenarios/02-fps.ts b/packages/player/tests/perf/scenarios/02-fps.ts new file mode 100644 index 000000000..ce04595bd --- /dev/null +++ b/packages/player/tests/perf/scenarios/02-fps.ts @@ -0,0 +1,236 @@ +/** + * Scenario 02: sustained playback against the composition clock. + * + * Loads the 10-video-grid fixture, calls `player.play()`, then samples + * `__player.getTime()` at fixed wall-clock intervals for ~5 seconds. The + * emitted metric is the ratio of composition-time advanced to wall-clock + * elapsed: + * + * composition_time_advancement_ratio = (getTime(end) - getTime(start)) / wallSeconds + * + * This reads ~1.0 when the runtime is keeping up with its intended playback + * speed and falls below 1.0 when the player stalls — a slow video decoder, a + * blocked main thread, a GC pause, anything that prevents the composition + * clock from advancing at real-time. The metric is independent of the host + * display refresh rate by construction: both numerator and denominator are + * wall-clock timestamps, neither is a frame count, so a 60Hz, 120Hz, or 240Hz + * runner sees the same value for a healthy player. + * + * Why we replaced the previous rAF-based FPS metric: + * The original implementation counted `requestAnimationFrame` ticks per + * wall-clock second and asserted `fps >= 55`. On a 120Hz CI runner that + * reads ~120 fps regardless of whether the composition is actually + * advancing, so the gate passed even when the player was silently stalling. + * See PR #400 review (jrusso1020 + miguel-heygen) for the full discussion; + * this implementation follows jrusso1020's "first choice" recommendation. + * + * Per the proposal: + * Test 1: Playback frame rate (player-perf-fps) + * Load 10-video composition → play 5s → measure how well the player kept + * up with the composition clock. + * + * Methodology details: + * - We install the wall-clock sampler before calling `play()` so the very + * first post-play tick is captured. We then wait for `__player.isPlaying()` + * to flip true (the parent→iframe `play` message is async via postMessage) + * and *reset* the sample buffer, so the measurement window only contains + * samples taken while the runtime was actively playing the timeline. + * - Sampling cadence is 100ms (10 samples/sec). That's fine-grained enough + * to spot a half-second stall but coarse enough that the sampler itself + * has negligible overhead. With a 5s window we collect ~50 samples; the + * ratio is computed from the first and last sample's `getTime()` values. + * - We use `setInterval` (not rAF) on purpose: rAF cadence is the metric we + * are trying to *avoid* depending on. `setInterval` is wall-clock-driven. + * + * Outputs one metric: + * - composition_time_advancement_ratio_min + * (higher-is-better, baseline key compositionTimeAdvancementRatioMin) + * + * Aggregation: `min(ratio)` across runs because the proposal asserts a floor + * — the worst run is the one that gates against regressions. + */ + +import type { Browser, Frame, Page } from "puppeteer-core"; +import { loadHostPage } from "../runner.ts"; +import type { Metric } from "../perf-gate.ts"; + +export type FpsScenarioOpts = { + browser: Browser; + origin: string; + /** Number of measurement runs. */ + runs: number; + /** If null, runs the default fixture (10-video-grid). */ + fixture: string | null; +}; + +const DEFAULT_FIXTURE = "10-video-grid"; +const PLAYBACK_DURATION_MS = 5_000; +const SAMPLE_INTERVAL_MS = 100; +const PLAY_CONFIRM_TIMEOUT_MS = 5_000; +const FRAME_LOOKUP_TIMEOUT_MS = 5_000; + +declare global { + interface Window { + /** (wallClockMs, compositionTimeSec) pairs collected by the sampler. */ + __perfPlaySamples?: Array<{ wall: number; comp: number }>; + /** setInterval handle used by the sampler; cleared at the end of the window. */ + __perfPlaySamplerHandle?: number; + /** Hyperframes runtime player API exposed inside the composition iframe. */ + __player?: { + play: () => void; + pause: () => void; + seek: (timeSeconds: number) => void; + getTime: () => number; + getDuration: () => number; + isPlaying: () => boolean; + }; + } +} + +type RunResult = { + ratio: number; + compElapsedSec: number; + wallElapsedSec: number; + samples: number; +}; + +/** + * Find the iframe Puppeteer Frame that hosts the fixture composition. The + * `` shell wraps an iframe whose URL is derived from the + * player's `src` attribute, so we match by path substring rather than full URL. + */ +async function getFixtureFrame(page: Page, fixture: string): Promise { + const expected = `/fixtures/${fixture}/`; + const deadline = Date.now() + FRAME_LOOKUP_TIMEOUT_MS; + while (Date.now() < deadline) { + const frame = page.frames().find((f) => f.url().includes(expected)); + if (frame) return frame; + await new Promise((r) => setTimeout(r, 50)); + } + throw new Error(`[scenario:fps] fixture frame not found for "${fixture}" within timeout`); +} + +async function runOnce( + opts: FpsScenarioOpts, + fixture: string, + idx: number, + total: number, +): Promise { + const ctx = await opts.browser.createBrowserContext(); + try { + const page = await ctx.newPage(); + const { duration } = await loadHostPage(page, opts.origin, { fixture }); + const frame = await getFixtureFrame(page, fixture); + + // Install the wall-clock sampler in the iframe context. We use setInterval + // because rAF cadence is exactly the host-display-dependent signal we are + // trying NOT to depend on; setInterval is driven by the event loop and + // gives us samples at fixed wall-clock cadence regardless of refresh rate. + await frame.evaluate((sampleIntervalMs: number) => { + window.__perfPlaySamples = []; + window.__perfPlaySamplerHandle = window.setInterval(() => { + const comp = window.__player?.getTime?.(); + if (typeof comp !== "number" || !Number.isFinite(comp)) return; + window.__perfPlaySamples!.push({ + wall: performance.timeOrigin + performance.now(), + comp, + }); + }, sampleIntervalMs); + }, SAMPLE_INTERVAL_MS); + + // Issue play from the host page (parent of the iframe). The player's + // public `play()` posts a control message into the iframe. + await page.evaluate(() => { + const el = document.getElementById("player") as (HTMLElement & { play: () => void }) | null; + if (!el) throw new Error("[scenario:fps] player element missing on host page"); + el.play(); + }); + + // Wait for the runtime to actually transition to playing — this is the + // signal that the postMessage round trip + timeline.play() finished. + await frame.waitForFunction(() => window.__player?.isPlaying?.() === true, { + timeout: PLAY_CONFIRM_TIMEOUT_MS, + }); + + // Reset samples now that playback is confirmed running. Anything captured + // before this point belongs to the ramp-up window (composition clock at + // 0, wall clock advancing) and would skew the ratio toward 0. + await frame.evaluate(() => { + window.__perfPlaySamples = []; + }); + + // Sustain playback for the measurement window. + await new Promise((r) => setTimeout(r, PLAYBACK_DURATION_MS)); + + // Stop the sampler and harvest the samples before pausing the runtime, + // so the pause command can't perturb the tail of the sample window. + const samples = (await frame.evaluate(() => { + if (window.__perfPlaySamplerHandle !== undefined) { + clearInterval(window.__perfPlaySamplerHandle); + window.__perfPlaySamplerHandle = undefined; + } + return window.__perfPlaySamples ?? []; + })) as Array<{ wall: number; comp: number }>; + + await page.evaluate(() => { + const el = document.getElementById("player") as (HTMLElement & { pause: () => void }) | null; + el?.pause(); + }); + + if (samples.length < 2) { + throw new Error( + `[scenario:fps] run ${idx + 1}/${total}: only ${samples.length} composition-clock samples captured (composition duration ${duration}s)`, + ); + } + + const first = samples[0]!; + const last = samples[samples.length - 1]!; + const wallElapsedSec = (last.wall - first.wall) / 1000; + const compElapsedSec = last.comp - first.comp; + const ratio = wallElapsedSec > 0 ? compElapsedSec / wallElapsedSec : 0; + + console.log( + `[scenario:fps] run[${idx + 1}/${total}] ratio=${ratio.toFixed(4)} compElapsed=${compElapsedSec.toFixed(3)}s wallElapsed=${wallElapsedSec.toFixed(3)}s samples=${samples.length}`, + ); + + await page.close(); + return { + ratio, + compElapsedSec, + wallElapsedSec, + samples: samples.length, + }; + } finally { + await ctx.close(); + } +} + +export async function runFps(opts: FpsScenarioOpts): Promise { + const fixture = opts.fixture ?? DEFAULT_FIXTURE; + const runs = Math.max(1, opts.runs); + console.log( + `[scenario:fps] fixture=${fixture} runs=${runs} window=${PLAYBACK_DURATION_MS}ms sampleInterval=${SAMPLE_INTERVAL_MS}ms`, + ); + + const ratios: number[] = []; + for (let i = 0; i < runs; i++) { + const result = await runOnce(opts, fixture, i, runs); + ratios.push(result.ratio); + } + + // Worst run wins: the proposal asserts a floor on this ratio, so a single + // bad run (slow decoder, GC pause, host contention) is the one that gates. + const ratioMin = Math.min(...ratios); + console.log(`[scenario:fps] aggregate min ratio=${ratioMin.toFixed(4)} runs=${runs}`); + + return [ + { + name: "composition_time_advancement_ratio_min", + baselineKey: "compositionTimeAdvancementRatioMin", + value: ratioMin, + unit: "ratio", + direction: "higher-is-better", + samples: ratios, + }, + ]; +} diff --git a/packages/player/tests/perf/scenarios/04-scrub.ts b/packages/player/tests/perf/scenarios/04-scrub.ts new file mode 100644 index 000000000..0c063c26f --- /dev/null +++ b/packages/player/tests/perf/scenarios/04-scrub.ts @@ -0,0 +1,307 @@ +/** + * Scenario 04: scrub latency. + * + * Loads the 10-video-grid fixture, pauses the player, then issues 10 seek + * calls in sequence — first through the synchronous "inline" path, then + * through the postMessage-driven "isolated" path — and measures the wall-clock + * latency from each `seek()` call to the first paint where the iframe's + * timeline reports the new time. + * + * Per the proposal: + * Test 2: Scrub latency (player-perf-scrub) + * Load composition → seek to 10 positions in sequence → measure time + * from seek() call to state update callback + * Assert: p95 < 80ms (isolated), p95 < 33ms (inline, Phase 4+) + * + * Methodology details: + * - Both modes are measured in the same page load. Inline runs first so + * the isolated mode's monkey-patch (forcing `_trySyncSeek` to return + * false) doesn't bleed into the inline samples. + * - "Inline" mode is the default behavior of `` when the + * iframe is same-origin and exposes `__player.seek()` synchronously. + * `seek()` lands the new frame in the same task as the input event. + * - "Isolated" mode is forced by replacing the player element's + * `_trySyncSeek` method with `() => false`, which sends the player + * element through the postMessage bridge — exactly what cross-origin + * embeds and Phase 1 (pre-sync) builds did. + * - Detection is via a `requestAnimationFrame` watcher inside the iframe + * that polls `__player.getTime()` until it is within `MATCH_TOLERANCE_S` + * of the requested target. We use a tolerance because the postMessage + * bridge converts seconds → frame number → seconds, which can introduce + * sub-frame quantization drift even for targets on the canonical fps grid. + * - Timing uses `performance.timeOrigin + performance.now()` in both the + * host and iframe contexts. `timeOrigin` is consistent across same-process + * frames, so the difference is a true wall-clock measurement of latency. + * - Seek targets alternate forward/backward across the 10s composition so + * no two consecutive seeks land near each other; this avoids the rAF + * watcher matching against a stale `getTime()` value before the seek + * command is processed. + * + * Outputs two metrics: + * - scrub_latency_p95_inline_ms (lower-is-better, baseline scrubLatencyP95InlineMs) + * - scrub_latency_p95_isolated_ms (lower-is-better, baseline scrubLatencyP95IsolatedMs) + * + * Aggregation: percentile(95) is computed across the pooled per-seek + * latencies from every run. With 10 seeks per mode per run × 3 runs we get + * 30 samples per mode per CI shard, which is enough for a stable p95. + */ + +import type { Browser, Frame, Page } from "puppeteer-core"; +import { loadHostPage, percentile } from "../runner.ts"; +import type { Metric } from "../perf-gate.ts"; + +export type ScrubScenarioOpts = { + browser: Browser; + origin: string; + /** Number of measurement runs. */ + runs: number; + /** If null, runs the default fixture (10-video-grid). */ + fixture: string | null; +}; + +const DEFAULT_FIXTURE = "10-video-grid"; +/** Targets are seconds within the composition (10s duration). */ +const SEEK_TARGETS: readonly number[] = [1.0, 7.0, 2.0, 8.0, 3.0, 9.0, 4.0, 6.0, 5.0, 0.5]; +/** + * Tolerance window the rAF watcher uses to decide that the iframe's reported + * `__player.getTime()` matches the requested seek target. 50ms = 1.5 frames at + * 30fps, which absorbs three sources of expected slippage: + * + * 1. **Frame quantization on the postMessage path.** `_sendControl("seek")` + * converts seconds → integer frame number → seconds inside the runtime, + * so e.g. a target of 1.0s on a 30fps composition lands at frame 30 → + * 1.000s exactly, but a target of 1.005s lands at frame 30 → still + * 1.000s, a 5ms quantization error baked into the API itself. + * 2. **Sub-frame intra-clip clock advance.** Even with the iframe paused, + * between the `seek()` call landing and the next rAF tick, the runtime + * may have already nudged time by a fraction of a frame as part of + * finalizing the seek; `getTime()` reports the post-finalize value. + * 3. **Variable host load + browser jitter on CI.** GitHub runners share + * cores, so a noisy neighbor can delay the rAF tick that would otherwise + * register the match by tens of ms. Picking a tolerance much tighter + * than this would gate against runner contention rather than player + * regressions. + * + * The metric this scenario asserts is *latency to user-visible match*, not + * *exact equality of the reported time*, so a 50ms acceptance window is the + * intended behavior — but if we ever want to tighten this (e.g. to assert + * sub-frame precision on the inline path now that PR #397 documented it), + * this is the knob to turn. Configurability is deliberately deferred until + * we have a concrete second use case; YAGNI. + * + * TODO(player-perf): revisit this constant after P0-1b lands and we have ~2 + * weeks of CI baseline data — if the inline-mode samples consistently cluster + * well below 50ms, drop this to e.g. 16ms (1 frame @ 60fps) and split the + * tolerance per mode (tighter for inline, current for isolated). + */ +const MATCH_TOLERANCE_S = 0.05; +/** Per-seek timeout; isolated p95 in the proposal is 80ms, so 1s is huge headroom. */ +const SEEK_TIMEOUT_MS = 1_000; +const PAUSE_CONFIRM_TIMEOUT_MS = 5_000; +const FRAME_LOOKUP_TIMEOUT_MS = 5_000; + +declare global { + interface Window { + /** Promise resolved by the iframe rAF watcher with the wall-clock t1 of the matching paint. */ + __perfScrubAwait?: Promise; + __player?: { + play: () => void; + pause: () => void; + seek: (timeSeconds: number) => void; + getTime: () => number; + getDuration: () => number; + isPlaying: () => boolean; + }; + } +} + +type Mode = "inline" | "isolated"; + +type RunResult = { + inlineLatencies: number[]; + isolatedLatencies: number[]; +}; + +/** + * Find the iframe Puppeteer Frame that hosts the fixture composition. Same + * helper as 02-fps.ts; duplicated locally so each scenario file is + * self-contained. + */ +async function getFixtureFrame(page: Page, fixture: string): Promise { + const expected = `/fixtures/${fixture}/`; + const deadline = Date.now() + FRAME_LOOKUP_TIMEOUT_MS; + while (Date.now() < deadline) { + const frame = page.frames().find((f) => f.url().includes(expected)); + if (frame) return frame; + await new Promise((r) => setTimeout(r, 50)); + } + throw new Error(`[scenario:scrub] fixture frame not found for "${fixture}" within timeout`); +} + +/** + * Measure a single seek's latency. + * + * Sequence: + * 1. Install a rAF watcher in the iframe that resolves with the wall-clock + * timestamp of the first paint where `__player.getTime()` is within + * tolerance of `target`. Promise is stashed on `window.__perfScrubAwait`. + * 2. Capture host wall-clock t0 and call `el.seek(target)` in the same task. + * 3. Await the iframe's resolved Promise (returns t1). + * 4. Latency = t1 - t0 (ms). + */ +async function measureSingleSeek(page: Page, frame: Frame, target: number): Promise { + await frame.evaluate( + (target: number, tolerance: number, timeoutMs: number) => { + window.__perfScrubAwait = new Promise((resolve, reject) => { + const deadlineWall = performance.timeOrigin + performance.now() + timeoutMs; + const tick = () => { + const wall = performance.timeOrigin + performance.now(); + const time = window.__player?.getTime?.() ?? Number.NaN; + if (Number.isFinite(time) && Math.abs(time - target) < tolerance) { + resolve(wall); + return; + } + if (wall > deadlineWall) { + reject(new Error(`[scrub] timeout target=${target} last=${time}`)); + return; + } + requestAnimationFrame(tick); + }; + requestAnimationFrame(tick); + }); + }, + target, + MATCH_TOLERANCE_S, + SEEK_TIMEOUT_MS, + ); + + const t0Wall = await page.evaluate((targetSeconds: number) => { + const el = document.getElementById("player") as + | (HTMLElement & { seek: (t: number) => void }) + | null; + if (!el) throw new Error("[scenario:scrub] player element missing on host page"); + const wall = performance.timeOrigin + performance.now(); + el.seek(targetSeconds); + return wall; + }, target); + + // Puppeteer awaits the Promise we stashed on window and returns its resolved value. + const t1Wall = (await frame.evaluate(() => window.__perfScrubAwait as Promise)) as number; + + return t1Wall - t0Wall; +} + +async function runScrubBatch( + page: Page, + frame: Frame, + mode: Mode, + idx: number, + total: number, +): Promise { + const latencies: number[] = []; + for (const target of SEEK_TARGETS) { + const latency = await measureSingleSeek(page, frame, target); + latencies.push(latency); + } + const p95 = percentile(latencies, 95); + console.log( + `[scenario:scrub] run[${idx + 1}/${total}] mode=${mode} p95=${p95.toFixed(2)}ms n=${latencies.length}`, + ); + return latencies; +} + +async function runOnce( + opts: ScrubScenarioOpts, + fixture: string, + idx: number, + total: number, +): Promise { + const ctx = await opts.browser.createBrowserContext(); + try { + const page = await ctx.newPage(); + const { duration } = await loadHostPage(page, opts.origin, { fixture }); + const requiredDuration = Math.max(...SEEK_TARGETS); + if (duration < requiredDuration) { + throw new Error( + `[scenario:scrub] fixture composition is ${duration.toFixed(2)}s but scrub targets require >= ${requiredDuration}s`, + ); + } + const frame = await getFixtureFrame(page, fixture); + + // Defensively pause: the host shell doesn't autoplay, but `pause()` also + // cancels any pending autoplay-on-ready behavior and guarantees the + // timeline isn't ticking under our seek measurements. + await page.evaluate(() => { + const el = document.getElementById("player") as (HTMLElement & { pause?: () => void }) | null; + el?.pause?.(); + }); + await frame.waitForFunction(() => window.__player?.isPlaying?.() === false, { + timeout: PAUSE_CONFIRM_TIMEOUT_MS, + }); + + // Inline mode first — the player's default `_trySyncSeek` path lands the + // seek synchronously when the iframe is same-origin (which it is here). + const inlineLatencies = await runScrubBatch(page, frame, "inline", idx, total); + + // Force isolated mode by shadowing `_trySyncSeek` on the instance with + // a function that always reports failure. The fallback in `seek()` then + // sends the seek through `_sendControl("seek", { frame })`, which is the + // same path a cross-origin embed (or a Phase 1 build without sync seek) + // would take. + await page.evaluate(() => { + const el = document.getElementById("player") as + | (HTMLElement & { _trySyncSeek?: (t: number) => boolean }) + | null; + if (!el) throw new Error("[scenario:scrub] player element missing on host page"); + el._trySyncSeek = () => false; + }); + + const isolatedLatencies = await runScrubBatch(page, frame, "isolated", idx, total); + + await page.close(); + return { inlineLatencies, isolatedLatencies }; + } finally { + await ctx.close(); + } +} + +export async function runScrub(opts: ScrubScenarioOpts): Promise { + const fixture = opts.fixture ?? DEFAULT_FIXTURE; + const runs = Math.max(1, opts.runs); + console.log( + `[scenario:scrub] fixture=${fixture} runs=${runs} seeks_per_mode=${SEEK_TARGETS.length} tolerance=${(MATCH_TOLERANCE_S * 1000).toFixed(0)}ms`, + ); + + const allInline: number[] = []; + const allIsolated: number[] = []; + for (let i = 0; i < runs; i++) { + const result = await runOnce(opts, fixture, i, runs); + allInline.push(...result.inlineLatencies); + allIsolated.push(...result.isolatedLatencies); + } + + const inlineP95 = percentile(allInline, 95); + const isolatedP95 = percentile(allIsolated, 95); + console.log( + `[scenario:scrub] aggregate inline_p95=${inlineP95.toFixed(2)}ms isolated_p95=${isolatedP95.toFixed(2)}ms (runs=${runs} samples_per_mode=${allInline.length})`, + ); + + return [ + { + name: "scrub_latency_p95_inline_ms", + baselineKey: "scrubLatencyP95InlineMs", + value: inlineP95, + unit: "ms", + direction: "lower-is-better", + samples: allInline, + }, + { + name: "scrub_latency_p95_isolated_ms", + baselineKey: "scrubLatencyP95IsolatedMs", + value: isolatedP95, + unit: "ms", + direction: "lower-is-better", + samples: allIsolated, + }, + ]; +} diff --git a/packages/player/tests/perf/scenarios/05-drift.ts b/packages/player/tests/perf/scenarios/05-drift.ts new file mode 100644 index 000000000..1fdd64a5c --- /dev/null +++ b/packages/player/tests/perf/scenarios/05-drift.ts @@ -0,0 +1,307 @@ +/** + * Scenario 05: media sync drift. + * + * Loads the 10-video-grid fixture, starts playback, and uses + * `requestVideoFrameCallback` on every video element to record + * (compositionTime, actualMediaTime) pairs for each decoded frame. Drift is + * the absolute difference between the *expected* media time (derived from the + * composition time using the runtime's clip transform) and the actual media + * time the decoder presented to the compositor. + * + * Per the proposal: + * Test 4: Media sync drift (player-perf-drift) + * Load 5-video composition → play for 10 seconds → on each RVFC callback, + * record drift between expected and actual media time + * Assert: max drift < 500ms, p95 drift < 100ms + * + * Methodology details: + * - We instrument *every* `video[data-start]` element in the fixture. The + * proposal called for 5 videos; the 10-video-grid gives us 10 streams in + * the same composition, which is a more conservative regression signal. + * - The expected media time uses the same transform the runtime applies in + * packages/core/src/runtime/media.ts: + * + * expectedMediaTime = (compositionTime - clip.start) * clip.playbackRate + * + clip.mediaStart + * + * We snapshot `clip.start` / `clip.mediaStart` / `clip.playbackRate` from + * each element's dataset + `defaultPlaybackRate` once when the sampler is + * installed, so the per-frame work is just a subtract + multiply + abs. + * - The runtime's media sync runs on a 50ms `setInterval`. Between syncs the + * video element's clock free-runs. The drift we measure here is the + * residual after that 50ms loop catches up — i.e. the user-visible glitch + * budget. The runtime hard-resyncs when |currentTime - relTime| > 0.5s + * (see media.ts), which is exactly the proposal's max-drift ceiling: a + * regression past 500ms means the corrective resync kicked in and the + * viewer saw a jump. + * - We install RVFC *before* calling play(), then reset the sample buffer + * once `__player.isPlaying()` flips true. Frames captured during the + * postMessage round-trip would compare a non-zero mediaTime against + * `getTime() === 0` and inflate drift to several hundred ms — same gotcha + * as 02-fps.ts. + * - Sustain window is 6s instead of the proposal's 10s because the fixture + * composition is exactly 10s long, and we want headroom before the + * end-of-timeline pause/clamp behavior. With 10 videos × ~25fps × 6s we + * still pool ~1500 samples per run, more than enough for a stable p95. + * + * Outputs two metrics: + * - media_drift_max_ms (lower-is-better, baseline driftMaxMs) + * - media_drift_p95_ms (lower-is-better, baseline driftP95Ms) + * + * Aggregation: max() and percentile(95) across the pooled per-frame drifts + * from every video in every run. + */ + +import type { Browser, Frame, Page } from "puppeteer-core"; +import { loadHostPage, percentile } from "../runner.ts"; +import type { Metric } from "../perf-gate.ts"; + +export type DriftScenarioOpts = { + browser: Browser; + origin: string; + /** Number of measurement runs. */ + runs: number; + /** If null, runs the default fixture (10-video-grid). */ + fixture: string | null; +}; + +const DEFAULT_FIXTURE = "10-video-grid"; +const PLAYBACK_DURATION_MS = 6_000; +const PLAY_CONFIRM_TIMEOUT_MS = 5_000; +const FRAME_LOOKUP_TIMEOUT_MS = 5_000; + +type DriftSample = { + compTime: number; + actualMediaTime: number; + clipStart: number; + clipMediaStart: number; + clipPlaybackRate: number; +}; + +declare global { + interface Window { + /** RVFC samples collected by the iframe-side observer. */ + __perfDriftSamples?: DriftSample[]; + /** Set to false to stop sampling at the end of the measurement window. */ + __perfDriftActive?: boolean; + __player?: { + play: () => void; + pause: () => void; + seek: (timeSeconds: number) => void; + getTime: () => number; + getDuration: () => number; + isPlaying: () => boolean; + }; + } +} + +type RunResult = { + drifts: number[]; + videoCount: number; +}; + +/** + * Find the iframe Puppeteer Frame that hosts the fixture composition. Same + * helper as the other scenarios; duplicated locally so each scenario file is + * self-contained. + */ +async function getFixtureFrame(page: Page, fixture: string): Promise { + const expected = `/fixtures/${fixture}/`; + const deadline = Date.now() + FRAME_LOOKUP_TIMEOUT_MS; + while (Date.now() < deadline) { + const frame = page.frames().find((f) => f.url().includes(expected)); + if (frame) return frame; + await new Promise((r) => setTimeout(r, 50)); + } + throw new Error(`[scenario:drift] fixture frame not found for "${fixture}" within timeout`); +} + +async function runOnce( + opts: DriftScenarioOpts, + fixture: string, + idx: number, + total: number, +): Promise { + const ctx = await opts.browser.createBrowserContext(); + try { + const page = await ctx.newPage(); + const { duration } = await loadHostPage(page, opts.origin, { fixture }); + const requiredDurationSec = PLAYBACK_DURATION_MS / 1000; + if (duration < requiredDurationSec) { + throw new Error( + `[scenario:drift] fixture composition is ${duration.toFixed(2)}s but drift sample window needs >= ${requiredDurationSec.toFixed(0)}s`, + ); + } + const frame = await getFixtureFrame(page, fixture); + + // Install RVFC on every `video[data-start]` element in the iframe. Each + // callback records the wall-clock-aligned (compositionTime, mediaTime) + // pair plus a snapshot of the clip transform so we can compute drift in + // node without re-querying the dataset on every frame. + const videoCount = (await frame.evaluate(() => { + window.__perfDriftSamples = []; + window.__perfDriftActive = true; + const videos = Array.from(document.querySelectorAll("video[data-start]")); + type RvfcMetadata = { mediaTime: number; presentationTime: number }; + type RvfcVideo = HTMLVideoElement & { + requestVideoFrameCallback?: ( + cb: (now: DOMHighResTimeStamp, metadata: RvfcMetadata) => void, + ) => number; + }; + let installed = 0; + for (const video of videos) { + const rvfcVideo = video as RvfcVideo; + const rvfc = rvfcVideo.requestVideoFrameCallback; + // Headless Chrome supports RVFC; bail quietly on browsers that don't. + if (!rvfc) continue; + const clipStart = Number.parseFloat(video.dataset.start ?? "0") || 0; + const clipMediaStart = + Number.parseFloat(video.dataset.playbackStart ?? video.dataset.mediaStart ?? "0") || 0; + const rawRate = video.defaultPlaybackRate; + const clipPlaybackRate = + Number.isFinite(rawRate) && rawRate > 0 ? Math.max(0.1, Math.min(5, rawRate)) : 1; + const tick = (_now: DOMHighResTimeStamp, metadata: RvfcMetadata) => { + if (!window.__perfDriftActive) return; + const compTime = window.__player?.getTime?.() ?? Number.NaN; + if (Number.isFinite(compTime)) { + window.__perfDriftSamples!.push({ + compTime, + actualMediaTime: metadata.mediaTime, + clipStart, + clipMediaStart, + clipPlaybackRate, + }); + } + rvfc.call(video, tick); + }; + rvfc.call(video, tick); + installed++; + } + return installed; + })) as number; + + if (videoCount === 0) { + throw new Error(`[scenario:drift] fixture ${fixture} contains no video[data-start] elements`); + } + + // Issue play from the host page; the player posts a control message into + // the iframe and the runtime starts the 50ms media sync poll. + await page.evaluate(() => { + const el = document.getElementById("player") as (HTMLElement & { play: () => void }) | null; + if (!el) throw new Error("[scenario:drift] player element missing on host page"); + el.play(); + }); + + // Wait for the runtime to confirm playing before we trust the samples. + await frame.waitForFunction(() => window.__player?.isPlaying?.() === true, { + timeout: PLAY_CONFIRM_TIMEOUT_MS, + }); + + // Reset the buffer now that playback is live. Anything captured during + // the postMessage round-trip would compare a non-zero mediaTime against + // `getTime() === 0` and bias drift up by hundreds of ms. + await frame.evaluate(() => { + window.__perfDriftSamples = []; + }); + + await new Promise((r) => setTimeout(r, PLAYBACK_DURATION_MS)); + + // Stop sampling first, then pause. Same ordering as 02-fps.ts so the + // pause command can't perturb the tail of the measurement window. + const samples = (await frame.evaluate(() => { + window.__perfDriftActive = false; + return window.__perfDriftSamples ?? []; + })) as DriftSample[]; + + await page.evaluate(() => { + const el = document.getElementById("player") as (HTMLElement & { pause: () => void }) | null; + el?.pause(); + }); + + if (samples.length === 0) { + throw new Error( + `[scenario:drift] run ${idx + 1}/${total}: zero RVFC samples captured (videos=${videoCount}, duration=${duration.toFixed(2)}s)`, + ); + } + + // Apply the runtime's transform to derive the expected media time, then + // compare against the actual media time the decoder presented. Convert + // to ms here so the gate threshold (driftMaxMs / driftP95Ms) compares + // apples-to-apples. + const drifts: number[] = []; + for (const s of samples) { + const expectedMediaTime = (s.compTime - s.clipStart) * s.clipPlaybackRate + s.clipMediaStart; + const driftMs = Math.abs(s.actualMediaTime - expectedMediaTime) * 1000; + drifts.push(driftMs); + } + + const max = Math.max(...drifts); + const p95 = percentile(drifts, 95); + console.log( + `[scenario:drift] run[${idx + 1}/${total}] max=${max.toFixed(2)}ms p95=${p95.toFixed(2)}ms videos=${videoCount} samples=${samples.length}`, + ); + + await page.close(); + return { drifts, videoCount }; + } finally { + await ctx.close(); + } +} + +export async function runDrift(opts: DriftScenarioOpts): Promise { + const fixture = opts.fixture ?? DEFAULT_FIXTURE; + const runs = Math.max(1, opts.runs); + console.log(`[scenario:drift] fixture=${fixture} runs=${runs} window=${PLAYBACK_DURATION_MS}ms`); + + const allDrifts: number[] = []; + let lastVideoCount = 0; + for (let i = 0; i < runs; i++) { + const result = await runOnce(opts, fixture, i, runs); + allDrifts.push(...result.drifts); + lastVideoCount = result.videoCount; + } + + // Worst case wins for max; p95 is computed across the pooled per-frame + // drifts from every video in every run. The proposal asserts max < 500ms + // and p95 < 100ms, so a single bad sample legitimately gates the build. + const maxDrift = Math.max(...allDrifts); + const p95Drift = percentile(allDrifts, 95); + // Coefficient of variation (stddev / mean) is logged here as a soft signal + // we can eyeball in CI output. We deliberately do NOT gate on it — the + // baseline asserts absolute thresholds (max, p95), and the underlying + // distribution is heavy-tailed (most frames are sub-50ms, occasional ones + // spike during the 50ms media-sync interval). But CV is a useful early + // warning: if it climbs significantly across CI runs while max + p95 stay + // green, our jitter assumptions about the runtime's resync loop have + // shifted (e.g. if media.ts changes its 50ms `setInterval` cadence) and + // we should revisit the baselines before they start producing flakes. + // TODO(player-perf): once we have ~2 weeks of CI baseline data, decide + // whether to publish CV as a tracked-but-ungated metric in baseline.json + // alongside max + p95, or wire it into the Slack regression report. + const meanDrift = allDrifts.reduce((a, b) => a + b, 0) / allDrifts.length; + const variance = allDrifts.reduce((acc, d) => acc + (d - meanDrift) ** 2, 0) / allDrifts.length; + const stddev = Math.sqrt(variance); + const cv = meanDrift > 0 ? stddev / meanDrift : 0; + console.log( + `[scenario:drift] aggregate max=${maxDrift.toFixed(2)}ms p95=${p95Drift.toFixed(2)}ms mean=${meanDrift.toFixed(2)}ms cv=${cv.toFixed(3)} videos=${lastVideoCount} samples=${allDrifts.length} runs=${runs}`, + ); + + return [ + { + name: "media_drift_max_ms", + baselineKey: "driftMaxMs", + value: maxDrift, + unit: "ms", + direction: "lower-is-better", + samples: allDrifts, + }, + { + name: "media_drift_p95_ms", + baselineKey: "driftP95Ms", + value: p95Drift, + unit: "ms", + direction: "lower-is-better", + samples: allDrifts, + }, + ]; +}