3 changes: 3 additions & 0 deletions constraints.txt
@@ -8,3 +8,6 @@ tornado>=6.5.5
black>=26.3.1
# Upgrade base image nvidia-cutlass-dsl 4.3.5 to 4.4.2
nvidia-cutlass-dsl>=4.4.2
# The `nvidia-cutlass-dsl` package does not pin numpy at all, which can be problematic in certain CI
# stages.
numpy>=2.0.0,<2.4 # numba 0.63.1 requires numpy<2.4
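For context (not part of this diff), a quick local sanity check of the new pin is sketched below, assuming `numpy` and `packaging` are importable in the environment being verified:

```python
# Hypothetical check: confirm the installed numpy sits inside the range pinned
# in constraints.txt (numba 0.63.1 requires numpy<2.4).
from packaging.version import Version

import numpy

assert Version("2.0.0") <= Version(numpy.__version__) < Version("2.4"), (
    f"numpy {numpy.__version__} violates the >=2.0.0,<2.4 constraint"
)
print(f"numpy {numpy.__version__} satisfies >=2.0.0,<2.4")
```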
4 changes: 2 additions & 2 deletions docker/Dockerfile.multi
@@ -54,8 +54,8 @@ RUN --mount=type=bind,source=docker/common,target=/opt/docker/common \
# Install constraints after install.sh so cleanup() doesn't delete the file mid-RUN
COPY constraints.txt /tmp/constraints.txt
RUN --mount=type=cache,target=/root/.cache/pip \
# WAR: uninstall dependencies that has vulnerability or need upgrading
pip3 uninstall -y tornado black nbconvert pillow nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base || true && \
# WAR: uninstall dependencies that have vulnerabilities or need upgrading.
pip3 uninstall -y tornado black nbconvert pillow nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base numpy || true && \
# Remove any leftover namespace dirs or dist-info that pip missed
rm -rf $(python3 -c "import site; print(site.getsitepackages()[0])")/nvidia_cutlass_dsl* && \
pip3 install --ignore-installed -r /tmp/constraints.txt && \
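For readers unfamiliar with the cleanup step above: the `rm -rf` line removes any `nvidia_cutlass_dsl*` directories in site-packages that `pip3 uninstall` left behind. A rough pure-Python equivalent, for illustration only (the Dockerfile keeps the shell form):

```python
# Illustrative equivalent of the Dockerfile cleanup: delete leftover
# nvidia_cutlass_dsl* namespace dirs or dist-info entries that pip missed.
import shutil
import site
from pathlib import Path

site_packages = Path(site.getsitepackages()[0])
for leftover in site_packages.glob("nvidia_cutlass_dsl*"):
    print("removing", leftover)
    if leftover.is_dir():
        shutil.rmtree(leftover, ignore_errors=True)
    else:
        leftover.unlink(missing_ok=True)
```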
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
@@ -13,7 +13,7 @@
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202604200956-13064
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202605060827-13616
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202605060827-13616
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202605060827-13616
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202605060827-13616
11 changes: 9 additions & 2 deletions tensorrt_llm/evaluate/lm_eval.py
@@ -193,12 +193,19 @@ def __init__(self,
chat_template_kwargs: Chat template kwargs as JSON string
output_dir: Directory to save the task infos.
"""
super().__init__(llm, sampling_params, streaming, output_dir=output_dir)
super().__init__(
llm,
sampling_params=sampling_params,
streaming=streaming,
chat_template_kwargs=chat_template_kwargs,
model_type=model_type,
is_force_single_image=is_force_single_image,
output_dir=output_dir,
)

# NOTE: Required by lm_eval to identify this as a multimodal model
self.MULTIMODAL = True
self.max_images = max_images
self.chat_template_kwargs = chat_template_kwargs
self.model_type = model_type if model_type is not None else self._get_model_type(
llm)
self.is_force_single_image = is_force_single_image
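The change above forwards `chat_template_kwargs`, `model_type`, and `is_force_single_image` to the base-class constructor instead of assigning them only on the subclass. A minimal sketch of that pattern, using illustrative class names rather than the actual `tensorrt_llm` classes:

```python
class BaseMultimodalEvaluator:
    """Illustrative stand-in for the real base evaluator."""

    def __init__(self, llm, *, sampling_params=None, streaming=False,
                 chat_template_kwargs=None, model_type=None,
                 is_force_single_image=False, output_dir=None):
        self.llm = llm
        self.sampling_params = sampling_params
        self.streaming = streaming
        self.chat_template_kwargs = chat_template_kwargs
        self.model_type = model_type
        self.is_force_single_image = is_force_single_image
        self.output_dir = output_dir


class LmEvalMultimodalWrapper(BaseMultimodalEvaluator):
    def __init__(self, llm, max_images=1, **kwargs):
        # Forward shared options to the base class instead of re-assigning them here.
        super().__init__(llm, **kwargs)
        # lm_eval uses this flag to identify the wrapper as a multimodal model.
        self.MULTIMODAL = True
        self.max_images = max_images
```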
17 changes: 14 additions & 3 deletions tests/integration/defs/accuracy/references/mmmu.yaml
@@ -30,11 +30,22 @@ Efficient-Large-Model/NVILA-8B:
- accuracy: 47.77
Efficient-Large-Model/VILA1.5-3b:
- accuracy: 32.33
# MMMU for Nemotron-Nano-12B-v2-VL-BF16 requires reasoning on.
# While enabling reasoning for current test harness is not supported,
# the metric here is for model sanity checking.
# MMMU for Nemotron-Nano-* models requires reasoning on.
# Since enabling reasoning in the current test harness is not supported, the metric here is for model
# regression testing, not an indicator of official benchmark results.
# See code comments in `test_llm_api_pytorch_multimodal.py` for more details.
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16:
- accuracy: 26.67
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16:
- accuracy: 39.0
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 39.0
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4:
- quant_algo: MIXED_PRECISION
kv_cache_quant_algo: FP8
accuracy: 38.0
microsoft/Phi-4-multimodal-instruct:
- accuracy: 53.67
Qwen/Qwen3-VL-30B-A3B-Instruct:
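The new entries follow the existing reference format: a list of records per model, optionally keyed by quantization settings. A hypothetical reader (the real lookup lives in the accuracy test harness, not in this file) might look like:

```python
# Hypothetical lookup: pick the reference entry matching the model and its
# quantization algorithms, then return the expected accuracy.
import yaml


def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} ({quant_algo}, {kv_cache_quant_algo})")


print(lookup_reference("tests/integration/defs/accuracy/references/mmmu.yaml",
                       "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
                       quant_algo="FP8", kv_cache_quant_algo="FP8"))
```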
tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -464,3 +464,102 @@ def test_auto_dtype(self, max_num_tokens):
) as llm:
task = MMMU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=self.sampling_params)


class TestNanoV3Omni(LlmapiAccuracyTestHarness):
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
# explicit image tiling/token accounting in the Mcore wrapper.
# We also keep the generation budget small for CI speed, and this evaluator
# does not strip reasoning traces after </think> before scoring. If the model
# ignores the non-thinking directive, answer extraction may see the reasoning.
EXTRA_EVALUATOR_KWARGS = dict(
apply_chat_template=True,
is_multimodal=True,
)

# NOTE: MMMU adds <|endoftext|> to the stop token.
sampling_params = SamplingParams(
max_tokens=MMMU.MAX_OUTPUT_LEN,
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
stop="<|endoftext|>",
temperature=0.0,
top_k=1,
)

@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
"model_name,model_path,kv_cache_config,max_batch_size,expected_quant_algo",
[
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
),
32,
None,
id="bf16",
),
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
dtype="fp8",
),
64,
QuantAlgo.FP8,
marks=skip_pre_hopper,
id="fp8",
),
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
dtype="fp8",
),
128,
QuantAlgo.MIXED_PRECISION,
marks=skip_pre_blackwell,
id="nvfp4",
),
],
)
def test_auto_dtype(
self,
model_name: str,
model_path: str,
kv_cache_config: KvCacheConfig,
max_batch_size: int,
expected_quant_algo: QuantAlgo | None,
) -> None:
task = MMMU(model_name)

with LLM(
model_path,
kv_cache_config=kv_cache_config,
enable_chunked_prefill=True,
# Keep the integration test fast; full benchmark-style runs may use
# a larger generation budget.
max_num_tokens=512,
# The amount of memory pre-allocated for mamba SSM states is proportional to max_batch_size,
# so lower it from its default of 2048.
# Quantized variants fit larger batches within the CI GPU memory budget.
max_batch_size=max_batch_size,
) as llm:
if expected_quant_algo is not None:
assert llm.args.quant_config.quant_algo == expected_quant_algo
task.evaluate(
llm,
sampling_params=self.sampling_params,
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS,
)
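To run one of the new parametrizations locally, a sketch using pytest's API is shown below; the file path is assumed from the test-list entries that follow, and CI drives these cases through the test lists rather than a direct call:

```python
# Hypothetical local invocation of the bf16 parametrization via pytest's API.
import pytest

pytest.main([
    "tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py"
    "::TestNanoV3Omni::test_auto_dtype[bf16]",
    "-q",
])
```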
3 changes: 3 additions & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -773,6 +773,9 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_pr
accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm]
accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype[forced_chunked_prefill]
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype[forced_chunked_prefill]
accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[bf16]
accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[fp8]
accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4]
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
2 changes: 2 additions & 0 deletions tests/integration/test_lists/test-db/l0_b200.yml
@@ -72,6 +72,7 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1_block_reuse-cutlass]
- accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4]
- disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
@@ -284,6 +285,7 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16
- accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[bf16]
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_h100.yml
@@ -119,6 +119,7 @@ l0_h100:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
- accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
- accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[fp8]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]