3 changes: 3 additions & 0 deletions constraints.txt
@@ -8,3 +8,6 @@ tornado>=6.5.5
black>=26.3.1
# Upgrade base image nvidia-cutlass-dsl 4.3.5 to 4.4.2
nvidia-cutlass-dsl>=4.4.2
# The `nvidia-cutlass-dsl` package does not pin numpy at all, which can be problematic in certain CI
# stages.
numpy>=2.0.0,<2.4 # numba 0.63.1 requires numpy<2.4
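For context (not part of this diff), a quick local sanity check of the new pin is sketched below, assuming `numpy` and `packaging` are importable in the environment being verified:

```python
# Hypothetical check: confirm the installed numpy sits inside the range pinned
# in constraints.txt (numba 0.63.1 requires numpy<2.4).
from packaging.version import Version

import numpy

assert Version("2.0.0") <= Version(numpy.__version__) < Version("2.4"), (
    f"numpy {numpy.__version__} violates the >=2.0.0,<2.4 constraint"
)
print(f"numpy {numpy.__version__} satisfies >=2.0.0,<2.4")
```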
4 changes: 2 additions & 2 deletions docker/Dockerfile.multi
@@ -54,8 +54,8 @@ RUN --mount=type=bind,source=docker/common,target=/opt/docker/common \
# Install constraints after install.sh so cleanup() doesn't delete the file mid-RUN
COPY constraints.txt /tmp/constraints.txt
RUN --mount=type=cache,target=/root/.cache/pip \
# WAR: uninstall dependencies that has vulnerability or need upgrading
pip3 uninstall -y tornado black nbconvert pillow nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base || true && \
# WAR: uninstall dependencies that have vulnerabilities or need upgrading.
pip3 uninstall -y tornado black nbconvert pillow nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base numpy || true && \
# Remove any leftover namespace dirs or dist-info that pip missed
rm -rf $(python3 -c "import site; print(site.getsitepackages()[0])")/nvidia_cutlass_dsl* && \
pip3 install --ignore-installed -r /tmp/constraints.txt && \
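For readers unfamiliar with the cleanup step above: the `rm -rf` line removes any `nvidia_cutlass_dsl*` directories in site-packages that `pip3 uninstall` left behind. A rough pure-Python equivalent, for illustration only (the Dockerfile keeps the shell form):

```python
# Illustrative equivalent of the Dockerfile cleanup: delete leftover
# nvidia_cutlass_dsl* namespace dirs or dist-info entries that pip missed.
import shutil
import site
from pathlib import Path

site_packages = Path(site.getsitepackages()[0])
for leftover in site_packages.glob("nvidia_cutlass_dsl*"):
    print("removing", leftover)
    if leftover.is_dir():
        shutil.rmtree(leftover, ignore_errors=True)
    else:
        leftover.unlink(missing_ok=True)
```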
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
@@ -13,7 +13,7 @@
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202605060827-13616
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202605060827-13616
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202605060827-13616
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202605060827-13616
11 changes: 9 additions & 2 deletions tensorrt_llm/evaluate/lm_eval.py
@@ -193,12 +193,19 @@ def __init__(self,
chat_template_kwargs: Chat template kwargs as JSON string
output_dir: Directory to save the task infos.
"""
super().__init__(llm, sampling_params, streaming, output_dir=output_dir)
super().__init__(
llm,
sampling_params=sampling_params,
streaming=streaming,
chat_template_kwargs=chat_template_kwargs,
model_type=model_type,
is_force_single_image=is_force_single_image,
output_dir=output_dir,
)

# NOTE: Required by lm_eval to identify this as a multimodal model
self.MULTIMODAL = True
self.max_images = max_images
self.chat_template_kwargs = chat_template_kwargs
self.model_type = model_type if model_type is not None else self._get_model_type(
llm)
self.is_force_single_image = is_force_single_image
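The change above forwards `chat_template_kwargs`, `model_type`, and `is_force_single_image` to the base-class constructor instead of assigning them only on the subclass. A minimal sketch of that pattern, using illustrative class names rather than the actual `tensorrt_llm` classes:

```python
class BaseMultimodalEvaluator:
    """Illustrative stand-in for the real base evaluator."""

    def __init__(self, llm, *, sampling_params=None, streaming=False,
                 chat_template_kwargs=None, model_type=None,
                 is_force_single_image=False, output_dir=None):
        self.llm = llm
        self.sampling_params = sampling_params
        self.streaming = streaming
        self.chat_template_kwargs = chat_template_kwargs
        self.model_type = model_type
        self.is_force_single_image = is_force_single_image
        self.output_dir = output_dir


class LmEvalMultimodalWrapper(BaseMultimodalEvaluator):
    def __init__(self, llm, max_images=1, **kwargs):
        # Forward shared options to the base class instead of re-assigning them here.
        super().__init__(llm, **kwargs)
        # lm_eval uses this flag to identify the wrapper as a multimodal model.
        self.MULTIMODAL = True
        self.max_images = max_images
```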
17 changes: 14 additions & 3 deletions tests/integration/defs/accuracy/references/mmmu.yaml
@@ -30,11 +30,22 @@ Efficient-Large-Model/NVILA-8B:
- accuracy: 47.77
Efficient-Large-Model/VILA1.5-3b:
- accuracy: 32.33
# MMMU for Nemotron-Nano-12B-v2-VL-BF16 requires reasoning on.
# While enabling reasoning for current test harness is not supported,
# the metric here is for model sanity checking.
# MMMU for Nemotron-Nano-* models requires reasoning on.
# Since enabling reasoning in the current test harness is not supported, the metric here is for model
# regression testing, not an indicator of official benchmark results.
# See code comments in `test_llm_api_pytorch_multimodal.py` for more details.
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16:
- accuracy: 26.67
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16:
- accuracy: 39.0
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 39.0
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4:
- quant_algo: MIXED_PRECISION
kv_cache_quant_algo: FP8
accuracy: 38.0
microsoft/Phi-4-multimodal-instruct:
- accuracy: 53.67
Qwen/Qwen3-VL-30B-A3B-Instruct:
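The new entries follow the existing reference format: a list of records per model, optionally keyed by quantization settings. A hypothetical reader (the real lookup lives in the accuracy test harness, not in this file) might look like:

```python
# Hypothetical lookup: pick the reference entry matching the model and its
# quantization algorithms, then return the expected accuracy.
import yaml


def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} ({quant_algo}, {kv_cache_quant_algo})")


print(lookup_reference("tests/integration/defs/accuracy/references/mmmu.yaml",
                       "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
                       quant_algo="FP8", kv_cache_quant_algo="FP8"))
```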
tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -464,3 +464,102 @@ def test_auto_dtype(self, max_num_tokens):
) as llm:
task = MMMU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=self.sampling_params)


class TestNanoV3Omni(LlmapiAccuracyTestHarness):
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
# explicit image tiling/token accounting in the Mcore wrapper.
# We also keep the generation budget small for CI speed, and this evaluator
# does not strip reasoning traces after </think> before scoring. If the model
# ignores the non-thinking directive, answer extraction may see the reasoning.
EXTRA_EVALUATOR_KWARGS = dict(
apply_chat_template=True,
is_multimodal=True,
)

# NOTE: MMMU adds <|endoftext|> to the stop token.
sampling_params = SamplingParams(
max_tokens=MMMU.MAX_OUTPUT_LEN,
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
stop="<|endoftext|>",
temperature=0.0,
top_k=1,
)

@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
"model_name,model_path,kv_cache_config,max_batch_size,expected_quant_algo",
[
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
),
32,
None,
id="bf16",
),
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
dtype="fp8",
),
64,
QuantAlgo.FP8,
marks=skip_pre_hopper,
id="fp8",
),
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
dtype="fp8",
),
128,
QuantAlgo.MIXED_PRECISION,
marks=skip_pre_blackwell,
id="nvfp4",
),
],
)
def test_auto_dtype(
self,
model_name: str,
model_path: str,
kv_cache_config: KvCacheConfig,
max_batch_size: int,
expected_quant_algo: QuantAlgo | None,
) -> None:
task = MMMU(model_name)

with LLM(
model_path,
kv_cache_config=kv_cache_config,
enable_chunked_prefill=True,
# Keep the integration test fast; full benchmark-style runs may use
# a larger generation budget.
max_num_tokens=512,
# The amount of memory pre-allocated for mamba SSM states is proportional to max_batch_size,
# so lower it from its default of 2048.
# Quantized variants fit larger batches within the CI GPU memory budget.
max_batch_size=max_batch_size,
) as llm:
if expected_quant_algo is not None:
assert llm.args.quant_config.quant_algo == expected_quant_algo
task.evaluate(
llm,
sampling_params=self.sampling_params,
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS,
)
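To run one of the new parametrizations locally, a sketch using pytest's API is shown below; the file path is assumed from the test-list entries that follow, and CI drives these cases through the test lists rather than a direct call:

```python
# Hypothetical local invocation of the bf16 parametrization via pytest's API.
import pytest

pytest.main([
    "tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py"
    "::TestNanoV3Omni::test_auto_dtype[bf16]",
    "-q",
])
```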
3 changes: 3 additions & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -773,6 +773,9 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_pr
accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm]
accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype[forced_chunked_prefill]
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype[forced_chunked_prefill]
accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[bf16]
accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[fp8]
accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4]
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
2 changes: 2 additions & 0 deletions tests/integration/test_lists/test-db/l0_b200.yml
@@ -72,6 +72,7 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1_block_reuse-cutlass]
- accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4]
- disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
@@ -284,6 +285,7 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16
- accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[bf16]
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_h100.yml
@@ -119,6 +119,7 @@ l0_h100:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
- accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
- accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[fp8]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]