Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions src/inference_endpoint/commands/benchmark/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,16 +965,16 @@ def _on_phase_start(phase: PhaseConfig) -> None:

if snap_dict is not None:
try:
runtime = ctx.config.settings.runtime
warmup = ctx.config.settings.warmup
report = Report.from_snapshot(
snap_dict,
seeds={
"scheduler_random_seed": runtime.scheduler_random_seed,
"dataloader_random_seed": runtime.dataloader_random_seed,
"warmup_random_seed": warmup.warmup_random_seed,
},
# Snapshot the run's load/runtime config straight from the
# Pydantic settings (single source of truth). Dumping whole
# sections captures RNG seeds, load pattern, sample count,
# durations, workers, etc. without hand-listing individual
# fields. endpoint_config (api_key/URLs) is a sibling of
# settings, so it is not included.
run_config = ctx.config.settings.model_dump(
mode="json",
include={"runtime", "load_pattern", "client", "warmup"},
)
report = Report.from_snapshot(snap_dict, run_config=run_config)
if not report.complete:
logger.warning(
"Report is incomplete (state=%s, n_pending_tasks=%d)",
Expand Down
33 changes: 20 additions & 13 deletions src/inference_endpoint/metrics/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,21 +138,23 @@ class Report(msgspec.Struct, frozen=True): # type: ignore[call-arg]
qps: float | None = None
tps: float | None = None

# RNG seeds for this run (scheduler/dataloader/warmup, from config). Carried
# so result_summary.json is self-validating: a reproducible run is identified
# by its seeds. These are config, not a measured metric, so the from_snapshot
# caller supplies them rather than reading them from the metrics snapshot.
seeds: dict[str, int] | None = None
# Run configuration (load_pattern, runtime incl. RNG seeds, client, warmup),
# taken from config. Carried so result_summary.json is self-describing and a
# reproducible run can be identified by its settings. This is config, not a
# measured metric, so the from_snapshot caller supplies it (a config
# model_dump) rather than reading it from the metrics snapshot.
run_config: dict[str, Any] | None = None

@classmethod
def from_snapshot(
cls, snap: dict[str, Any], *, seeds: dict[str, int] | None = None
cls, snap: dict[str, Any], *, run_config: dict[str, Any] | None = None
) -> Report:
"""Build a Report from a snapshot dict.

``seeds`` (optional) carries the run's RNG seeds from config into the
report so result_summary.json is self-validating; it is keyword-only
because it is config, not part of the metrics snapshot.
``run_config`` (optional, keyword-only) carries the run's configuration
(a config ``model_dump``: load_pattern, runtime incl. seeds, client,
warmup) into the report so result_summary.json is self-describing; it is
config, not part of the metrics snapshot.

Input is the dict form produced by
``inference_endpoint.async_utils.services.metrics_aggregator.snapshot
Expand Down Expand Up @@ -228,7 +230,7 @@ def _series_dict(key: str) -> dict[str, Any]:
output_sequence_lengths=osl,
qps=qps,
tps=tps,
seeds=seeds,
run_config=run_config,
)

def to_json(self, save_to: os.PathLike | None = None) -> bytes:
Expand Down Expand Up @@ -258,9 +260,14 @@ def display(
fn(f"Version: {self.version}{newline}")
if self.git_sha:
fn(f"Git SHA: {self.git_sha}{newline}")
if self.seeds:
seed_str = ", ".join(f"{k}={v}" for k, v in self.seeds.items())
fn(f"Seeds: {seed_str}{newline}")
if self.run_config:
fn(f"Run config:{newline}")
for section, params in self.run_config.items():
if isinstance(params, dict):
inner = ", ".join(f"{k}={v}" for k, v in params.items())
fn(f" {section}: {inner}{newline}")
else:
fn(f" {section}: {params}{newline}")
if self.test_started_at > 0:
approx = monotime_to_datetime(self.test_started_at)
fn(f"Test started at: {approx.strftime('%Y-%m-%d %H:%M:%S')}{newline}")
Expand Down
35 changes: 21 additions & 14 deletions tests/unit/metrics/test_report_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,16 +168,19 @@ def test_with_metrics(self):
# OSL data was written → tps is computable.
assert report.tps is not None

def test_seeds_keyword_only_passthrough(self):
"""seeds is config, not a snapshot metric: None unless the caller
def test_run_config_keyword_only_passthrough(self):
"""run_config is config, not a snapshot metric: None unless the caller
supplies it, and carried verbatim into the report when it does."""
registry = _make_registry(n_samples=5)
snap = snapshot_to_dict(
registry.build_snapshot(state=SessionState.COMPLETE, n_pending_tasks=0)
)
assert Report.from_snapshot(snap).seeds is None
seeds = {"scheduler_random_seed": 42, "dataloader_random_seed": 7}
assert Report.from_snapshot(snap, seeds=seeds).seeds == seeds
assert Report.from_snapshot(snap).run_config is None
run_config = {
"runtime": {"scheduler_random_seed": 42, "n_samples_to_issue": 100},
"load_pattern": {"type": "poisson", "target_qps": 14.75},
}
assert Report.from_snapshot(snap, run_config=run_config).run_config == run_config

def test_failed_uses_tracked_counter(self):
"""``n_samples_failed`` reads from ``tracked_samples_failed``, not
Expand Down Expand Up @@ -266,23 +269,27 @@ def test_to_json_qps_tps_null_without_duration(self):
assert data["qps"] is None
assert data["tps"] is None

def test_to_json_serializes_seeds(self):
"""result_summary.json carries the run's RNG seeds so a run can be
validated as reproducible; absent seeds serialize as null."""
def test_to_json_and_display_carry_run_config(self):
"""result_summary.json + report.txt carry the run's config so a run is
self-describing/reproducible; absent run_config serializes as null."""
registry = _make_registry(n_samples=5)
snap = snapshot_to_dict(
registry.build_snapshot(state=SessionState.COMPLETE, n_pending_tasks=0)
)
seeds = {"scheduler_random_seed": 42, "dataloader_random_seed": 42}
report = Report.from_snapshot(snap, seeds=seeds)
assert json.loads(report.to_json())["seeds"] == seeds
run_config = {
"load_pattern": {"type": "poisson", "target_qps": 14.75},
"runtime": {"scheduler_random_seed": 42},
}
report = Report.from_snapshot(snap, run_config=run_config)
assert json.loads(report.to_json())["run_config"] == run_config

lines: list[str] = []
report.display(fn=lines.append, summary_only=True)
assert any("Seeds:" in ln for ln in lines)
assert any("Run config:" in ln for ln in lines)
assert any("load_pattern:" in ln and "poisson" in ln for ln in lines)

# Absent seeds -> null, not omitted.
assert json.loads(Report.from_snapshot(snap).to_json())["seeds"] is None
# Absent run_config -> null, not omitted.
assert json.loads(Report.from_snapshot(snap).to_json())["run_config"] is None

def test_to_json_save(self, tmp_path: Path):
registry = _make_registry(n_samples=5)
Expand Down
Loading