mlcommons · nvzhihanj · Jun 27, 2026
@@ -965,16 +965,16 @@ def _on_phase_start(phase: PhaseConfig) -> None:
 
             if snap_dict is not None:
                 try:
-                    runtime = ctx.config.settings.runtime
-                    warmup = ctx.config.settings.warmup
-                    report = Report.from_snapshot(
-                        snap_dict,
-                        seeds={
-                            "scheduler_random_seed": runtime.scheduler_random_seed,
-                            "dataloader_random_seed": runtime.dataloader_random_seed,
-                            "warmup_random_seed": warmup.warmup_random_seed,
-                        },
+                    # Snapshot the run's load/runtime config straight from the
+                    # Pydantic settings (single source of truth) — captures RNG seeds,
+                    # load pattern, sample count, durations, workers, etc. without
+                    # hand-listing fields; only sections named in `include` are dumped.
+                    # endpoint_config (api_key/URLs) is a sibling of settings: excluded.
+                    run_config = ctx.config.settings.model_dump(
+                        mode="json",
+                        include={"runtime", "load_pattern", "client", "warmup"},
                     )
+                    report = Report.from_snapshot(snap_dict, run_config=run_config)
                     if not report.complete:
                         logger.warning(
                             "Report is incomplete (state=%s, n_pending_tasks=%d)",

@@ -138,21 +138,23 @@ class Report(msgspec.Struct, frozen=True):  # type: ignore[call-arg]
     qps: float | None = None
     tps: float | None = None
 
-    # RNG seeds for this run (scheduler/dataloader/warmup, from config). Carried
-    # so result_summary.json is self-validating: a reproducible run is identified
-    # by its seeds. These are config, not a measured metric, so the from_snapshot
-    # caller supplies them rather than reading them from the metrics snapshot.
-    seeds: dict[str, int] | None = None
+    # Run configuration (load_pattern, runtime incl. RNG seeds, client, warmup),
+    # from config. Carried so result_summary.json is self-describing and a
+    # reproducible run is identified by its settings. Config, not a measured
+    # metric, so the from_snapshot caller supplies it (a config model_dump) rather
+    # than reading it from the metrics snapshot.
+    run_config: dict[str, Any] | None = None
 
     @classmethod
     def from_snapshot(
-        cls, snap: dict[str, Any], *, seeds: dict[str, int] | None = None
+        cls, snap: dict[str, Any], *, run_config: dict[str, Any] | None = None
     ) -> Report:
         """Build a Report from a snapshot dict.
 
-        ``seeds`` (optional) carries the run's RNG seeds from config into the
-        report so result_summary.json is self-validating; it is keyword-only
-        because it is config, not part of the metrics snapshot.
+        ``run_config`` (optional, keyword-only) carries the run's configuration
+        (a config ``model_dump``: load_pattern, runtime incl. seeds, client,
+        warmup) into the report so result_summary.json is self-describing; it is
+        config, not part of the metrics snapshot.
 
         Input is the dict form produced by
         ``inference_endpoint.async_utils.services.metrics_aggregator.snapshot
@@ -228,7 +230,7 @@ def _series_dict(key: str) -> dict[str, Any]:
             output_sequence_lengths=osl,
             qps=qps,
             tps=tps,
-            seeds=seeds,
+            run_config=run_config,
         )
 
     def to_json(self, save_to: os.PathLike | None = None) -> bytes:
@@ -258,9 +260,14 @@ def display(
         fn(f"Version: {self.version}{newline}")
         if self.git_sha:
             fn(f"Git SHA: {self.git_sha}{newline}")
-        if self.seeds:
-            seed_str = ", ".join(f"{k}={v}" for k, v in self.seeds.items())
-            fn(f"Seeds: {seed_str}{newline}")
+        if self.run_config:
+            fn(f"Run config:{newline}")
+            for section, params in self.run_config.items():
+                if isinstance(params, dict):
+                    inner = ", ".join(f"{k}={v}" for k, v in params.items())
+                    fn(f"  {section}: {inner}{newline}")
+                else:
+                    fn(f"  {section}: {params}{newline}")
         if self.test_started_at > 0:
             approx = monotime_to_datetime(self.test_started_at)
             fn(f"Test started at: {approx.strftime('%Y-%m-%d %H:%M:%S')}{newline}")

@@ -168,16 +168,19 @@ def test_with_metrics(self):
         # OSL data was written → tps is computable.
         assert report.tps is not None
 
-    def test_seeds_keyword_only_passthrough(self):
-        """seeds is config, not a snapshot metric: None unless the caller
+    def test_run_config_keyword_only_passthrough(self):
+        """run_config is config, not a snapshot metric: None unless the caller
         supplies it, and carried verbatim into the report when it does."""
         registry = _make_registry(n_samples=5)
         snap = snapshot_to_dict(
             registry.build_snapshot(state=SessionState.COMPLETE, n_pending_tasks=0)
         )
-        assert Report.from_snapshot(snap).seeds is None
-        seeds = {"scheduler_random_seed": 42, "dataloader_random_seed": 7}
-        assert Report.from_snapshot(snap, seeds=seeds).seeds == seeds
+        assert Report.from_snapshot(snap).run_config is None
+        run_config = {
+            "runtime": {"scheduler_random_seed": 42, "n_samples_to_issue": 100},
+            "load_pattern": {"type": "poisson", "target_qps": 14.75},
+        }
+        assert Report.from_snapshot(snap, run_config=run_config).run_config == run_config
 
     def test_failed_uses_tracked_counter(self):
         """``n_samples_failed`` reads from ``tracked_samples_failed``, not
@@ -266,23 +269,27 @@ def test_to_json_qps_tps_null_without_duration(self):
         assert data["qps"] is None
         assert data["tps"] is None
 
-    def test_to_json_serializes_seeds(self):
-        """result_summary.json carries the run's RNG seeds so a run can be
-        validated as reproducible; absent seeds serialize as null."""
+    def test_to_json_and_display_carry_run_config(self):
+        """result_summary.json + report.txt carry the run's config so a run is
+        self-describing/reproducible; absent run_config serializes as null."""
         registry = _make_registry(n_samples=5)
         snap = snapshot_to_dict(
             registry.build_snapshot(state=SessionState.COMPLETE, n_pending_tasks=0)
         )
-        seeds = {"scheduler_random_seed": 42, "dataloader_random_seed": 42}
-        report = Report.from_snapshot(snap, seeds=seeds)
-        assert json.loads(report.to_json())["seeds"] == seeds
+        run_config = {
+            "load_pattern": {"type": "poisson", "target_qps": 14.75},
+            "runtime": {"scheduler_random_seed": 42},
+        }
+        report = Report.from_snapshot(snap, run_config=run_config)
+        assert json.loads(report.to_json())["run_config"] == run_config
 
         lines: list[str] = []
         report.display(fn=lines.append, summary_only=True)
-        assert any("Seeds:" in ln for ln in lines)
+        assert any("Run config:" in ln for ln in lines)
+        assert any("load_pattern:" in ln and "poisson" in ln for ln in lines)
 
-        # Absent seeds -> null, not omitted.
-        assert json.loads(Report.from_snapshot(snap).to_json())["seeds"] is None
+        # Absent run_config -> null, not omitted.
+        assert json.loads(Report.from_snapshot(snap).to_json())["run_config"] is None
 
     def test_to_json_save(self, tmp_path: Path):
         registry = _make_registry(n_samples=5)