diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 298d93910715..2bd7305a2350 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -281,25 +281,29 @@ def uploadResults(def pipeline, SlurmCluster cluster, String clusterName, String downloadPerfResultSucceed = Utils.exec(pipeline, script: scpFromRemoteCmd(remote, scpSources, "${stageName}/"), returnStatus: true, numRetries: 3) == 0 } + // Download all slurm log files from the working directory + def slurmLogListOutput = Utils.exec( + pipeline, + script: Utils.sshUserCmd( + remote, + "\"find '${perfResultsBasePath}' -maxdepth 1 \\( -name '*.log' -o -name '*.out' -o -name 'slurm-*' \\) -printf '%f\\n' || true\"" + ), + returnStdout: true, + numRetries: 3 + )?.trim() ?: "" + def slurmLogFiles = slurmLogListOutput.split(/\s+/).collect { it.trim() }.findAll { it } + echo "Slurm Log Files: ${slurmLogFiles}" + if (slurmLogFiles) { + sh "mkdir -p ${stageName}/slurm-logs" + def slurmLogSources = slurmLogFiles.size() == 1 + ? "${perfResultsBasePath}/${slurmLogFiles[0]}" + : "{${slurmLogFiles.collect { "${perfResultsBasePath}/${it}" }.join(',')}}" + Utils.exec(pipeline, script: scpFromRemoteCmd(remote, slurmLogSources, "${stageName}/slurm-logs/"), returnStatus: true, numRetries: 3) + } + + echo "hasTimeoutTest: ${hasTimeoutTest}, downloadResultSucceed: ${downloadResultSucceed}, downloadPerfResultSucceed: ${downloadPerfResultSucceed}" - if (hasTimeoutTest || downloadResultSucceed || downloadPerfResultSucceed) { - // On retry attempts, rename freshly-downloaded result XMLs so that - // (a) the tar for this attempt is distinguishable from prior attempts - // already uploaded to Artifactory, and - // (b) the junit() glob below picks up this attempt's results as a - // separate set, keeping earlier attempts' test data visible in - // the Jenkins build report rather than overwriting it. - if (postTag) { - sh """ - cd ${stageName} - for f in results*.xml; do - [ -f "\$f" ] || continue - case "\$f" in *${postTag}.xml) continue ;; esac - name=\"\${f%.xml}\" - mv \"\$f\" \"\${name}${postTag}.xml\" || true - done - """ - } + if (hasTimeoutTest || downloadResultSucceed || downloadPerfResultSucceed || slurmLogFiles) { sh "ls -al ${stageName}/" echo "Upload test results." sh "tar -czvf results-${stageName}${postTag}.tar.gz ${stageName}/" diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index a7e49d9d369c..792372938952 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -408,7 +408,8 @@ def configure_kv_cache_capacity(self, mapping = self._mapping # TODO: support CP by generating dummy requests for it. - assert 'cp_type' not in mapping.cp_config + if not self._skip_est: + assert 'cp_type' not in mapping.cp_config fraction = self._kv_cache_config.free_gpu_memory_fraction diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 21b5f523942e..1c6a2ac8343d 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -300,7 +300,7 @@ def create_py_executor( A fully initialized PyExecutor instance. """ - skip_est = os.environ.get("TRTLLM_SKIP_KV_CACHE_ESTIMATION", '0') == '1' + skip_est = os.environ.get("TRTLLM_SKIP_KV_CACHE_ESTIMATION", '1') == '1' llm_args, checkpoint_loader = _load_config_and_create_checkpoint_loader( llm_args, checkpoint_dir) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 8e0a98faf49e..af666ad36d68 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -2428,11 +2428,11 @@ class KvCacheConfig(StrictBaseModel, PybindMirror): description= "Number of sink tokens (tokens to always keep in attention window).") free_gpu_memory_fraction: Optional[float] = Field( - default=0.9, + default=0.7, ge=0, le=1, description= - "The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used." + "The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 70%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used." ) host_cache_size: Optional[int] = Field( default=None, diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 84deaeb223d2..8f1ec4ce13c0 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -539,7 +539,7 @@ def run_parallel_test(model_name: str, ) kv_cache_config = { - "free_gpu_memory_fraction": 0.5, + "free_gpu_memory_fraction": 0.35, "enable_block_reuse": True } ctx_server_config = { @@ -600,7 +600,8 @@ def test_auto_dtype(self, ctx_disable_overlap_scheduler, ctx_server_config = { "disable_overlap_scheduler": ctx_disable_overlap_scheduler, "kv_cache_config": { - "enable_block_reuse": ctx_enable_block_reuse + "enable_block_reuse": ctx_enable_block_reuse, + "free_gpu_memory_fraction": 0.4 } } ctx_server_config["cache_transceiver_config"] = { @@ -610,7 +611,8 @@ def test_auto_dtype(self, ctx_disable_overlap_scheduler, gen_server_config = { "disable_overlap_scheduler": gen_disable_overlap_scheduler, "kv_cache_config": { - "enable_block_reuse": gen_enable_block_reuse + "enable_block_reuse": gen_enable_block_reuse, + "free_gpu_memory_fraction": 0.3 } } gen_server_config["cache_transceiver_config"] = { @@ -687,7 +689,7 @@ def test_ngram(self): "is_public_pool": True } kv_cache_config = { - "free_gpu_memory_fraction": 0.5, + "free_gpu_memory_fraction": 0.35, "enable_block_reuse": False } ctx_server_config = { @@ -739,7 +741,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): True, # BS=1 does not need overlap scheduling "speculative_config": speculative_decoding_config, "kv_cache_config": { - "free_gpu_memory_fraction": 0.5, + "free_gpu_memory_fraction": 0.3, "enable_block_reuse": True # reuse on context requests }, "max_num_tokens": 13393 * 2, @@ -754,7 +756,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): "disable_overlap_scheduler": not overlap_scheduler, "speculative_config": speculative_decoding_config, "kv_cache_config": { - "free_gpu_memory_fraction": 0.5, + "free_gpu_memory_fraction": 0.3, "enable_block_reuse": False }, "max_num_tokens": 13393 * 2, @@ -791,14 +793,20 @@ def test_guided_decoding(self, backend: str, mocker): "cache_transceiver_config": { "backend": "DEFAULT", "max_tokens_in_buffer": 4096 - } + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.3 + }, } gen_server_config = { "guided_decoding_backend": backend, "cache_transceiver_config": { "backend": "DEFAULT", "max_tokens_in_buffer": 4096 - } + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.3 + }, } disaggregated_server_config = { "hostname": "localhost", @@ -834,7 +842,7 @@ def test_guided_decoding_with_eagle3(self, backend: str, "disable_overlap_scheduler": True, "speculative_config": speculative_decoding_config, "kv_cache_config": { - "free_gpu_memory_fraction": 0.8, + "free_gpu_memory_fraction": 0.3, }, "guided_decoding_backend": backend, "cache_transceiver_config": { @@ -847,7 +855,7 @@ def test_guided_decoding_with_eagle3(self, backend: str, "disable_overlap_scheduler": not eagle3_one_model, "speculative_config": speculative_decoding_config, "kv_cache_config": { - "free_gpu_memory_fraction": 0.8, + "free_gpu_memory_fraction": 0.3, }, "guided_decoding_backend": backend, "cache_transceiver_config": { @@ -938,6 +946,14 @@ def test_auto_dtype(self, overlap_scheduler): # Keep this low to avoid warmup OOM in CI ctx_server_config["max_seq_len"] = 8192 gen_server_config["max_seq_len"] = 8192 + + ctx_server_config["kv_cache_config"] = { + "free_gpu_memory_fraction": 0.6, + } + gen_server_config["kv_cache_config"] = { + "free_gpu_memory_fraction": 0.6, + } + disaggregated_server_config = { "hostname": "localhost", "backend": "pytorch", @@ -971,6 +987,9 @@ def test_nixl_backend(self): "cache_transceiver_config": { "backend": "NIXL", "max_tokens_in_buffer": 4096 + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.6 } } gen_server_config = { @@ -978,6 +997,9 @@ def test_nixl_backend(self): "cache_transceiver_config": { "backend": "NIXL", "max_tokens_in_buffer": 4096 + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.6 } } disaggregated_server_config = { @@ -1012,6 +1034,9 @@ def test_gen_only_sync(self): "transceiver_runtime": "PYTHON", "max_tokens_in_buffer": 4096, }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5 + } } gen_server_config = { "disable_overlap_scheduler": True, @@ -1020,6 +1045,9 @@ def test_gen_only_sync(self): "transceiver_runtime": "PYTHON", "max_tokens_in_buffer": 4096, }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5 + } } disaggregated_server_config = { "hostname": "localhost", @@ -1056,6 +1084,14 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn): "backend": "DEFAULT", "max_tokens_in_buffer": 4096 } + + ctx_server_config["kv_cache_config"] = { + "free_gpu_memory_fraction": 0.6, + } + gen_server_config["kv_cache_config"] = { + "free_gpu_memory_fraction": 0.6, + } + if mtp_nextn > 0: ctx_server_config["speculative_config"] = { "decoding_type": "MTP", @@ -1065,6 +1101,7 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn): "decoding_type": "MTP", "num_nextn_predict_layers": mtp_nextn } + disaggregated_server_config = { "hostname": "localhost", "backend": "pytorch", @@ -1187,7 +1224,7 @@ def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker): ctx_server_config = { "disable_overlap_scheduler": True, "kv_cache_config": { - "free_gpu_memory_fraction": 0.8, + "free_gpu_memory_fraction": 0.55, }, "guided_decoding_backend": backend, "cache_transceiver_config": { @@ -1198,7 +1235,7 @@ def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker): gen_server_config = { "disable_overlap_scheduler": False, "kv_cache_config": { - "free_gpu_memory_fraction": 0.8, + "free_gpu_memory_fraction": 0.55, }, "guided_decoding_backend": backend, "cache_transceiver_config": { @@ -1239,18 +1276,20 @@ def test_kv_cache_v2_nixl_python(self): "disable_overlap_scheduler": True, "kv_cache_config": { "enable_block_reuse": False, - "use_kv_cache_manager_v2": True + "use_kv_cache_manager_v2": True, + "free_gpu_memory_fraction": 0.55 }, "cache_transceiver_config": { "backend": "NIXL", - "transceiver_runtime": "PYTHON" + "transceiver_runtime": "PYTHON", } } gen_server_config = { "disable_overlap_scheduler": True, "kv_cache_config": { "enable_block_reuse": False, - "use_kv_cache_manager_v2": True + "use_kv_cache_manager_v2": True, + "free_gpu_memory_fraction": 0.55 }, "cache_transceiver_config": { "backend": "NIXL", @@ -1306,11 +1345,13 @@ def test_auto_dtype(self, block_reuse): "max_attention_window": [512, 512, 512, 512, 512, 32768], "enable_block_reuse": block_reuse, "enable_partial_reuse": block_reuse, + "free_gpu_memory_fraction": 0.65, } gen_server_config["kv_cache_config"] = { "max_attention_window": [512, 512, 512, 512, 512, 32768], "enable_block_reuse": block_reuse, "enable_partial_reuse": block_reuse, + "free_gpu_memory_fraction": 0.65, } disaggregated_server_config = { "hostname": "localhost", @@ -1619,6 +1660,9 @@ def test_nixl_backend(self): "cache_transceiver_config": { "backend": "NIXL", "max_tokens_in_buffer": 4096 + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.4 } } gen_server_config = { @@ -1626,6 +1670,9 @@ def test_nixl_backend(self): "cache_transceiver_config": { "backend": "NIXL", "max_tokens_in_buffer": 4096 + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.4 } } disaggregated_server_config = { @@ -1650,6 +1697,7 @@ def test_auto_dtype(self, overlap_scheduler, enable_partial_reuse): kv_cache_config = { "enable_block_reuse": True, "enable_partial_reuse": enable_partial_reuse, + "free_gpu_memory_fraction": 0.5, } ctx_server_config = { "disable_overlap_scheduler": True, @@ -1690,6 +1738,7 @@ def _test_chunked_prefill_helper(self, *, ctx_pp: int): kv_cache_config = { "enable_block_reuse": True, + "free_gpu_memory_fraction": 0.5 } ctx_server_config = { @@ -1712,6 +1761,7 @@ def _test_chunked_prefill_helper(self, *, ctx_pp: int): "max_tokens_in_buffer": 4096 }, "max_batch_size": max_batch_size, + "kv_cache_config": kv_cache_config, } disaggregated_server_config = { "hostname": "localhost", @@ -1774,7 +1824,7 @@ def test_auto_dtype_with_helix(self, comms_medium, cuda_graph_config, raise ValueError(f"Unknown comms_medium: {comms_medium}") gen_ep = gen_tp * gen_cp kv_cache_config = { - "free_gpu_memory_fraction": 0.5, + "free_gpu_memory_fraction": 0.35, "enable_block_reuse": False, "enable_partial_reuse": False, "tokens_per_block": 32, @@ -1944,7 +1994,7 @@ def test_nvfp4(self): "enable_attention_dp": True, "trust_remote_code": True, "kv_cache_config": { - "free_gpu_memory_fraction": 0.8, + "free_gpu_memory_fraction": 0.65, }, } gen_server_config = { @@ -1958,7 +2008,7 @@ def test_nvfp4(self): "enable_attention_dp": True, "trust_remote_code": True, "kv_cache_config": { - "free_gpu_memory_fraction": 0.8, + "free_gpu_memory_fraction": 0.65, }, } disaggregated_server_config = { diff --git a/tests/integration/defs/accuracy/test_dwdp_disaggregated_serving.py b/tests/integration/defs/accuracy/test_dwdp_disaggregated_serving.py index ce1644c04711..b415562dc232 100644 --- a/tests/integration/defs/accuracy/test_dwdp_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_dwdp_disaggregated_serving.py @@ -204,7 +204,7 @@ def test_dwdp_accuracy(self): "max_batch_size": 16, "max_num_tokens": 8192, "kv_cache_config": { - "free_gpu_memory_fraction": 0.4, + "free_gpu_memory_fraction": 0.23, "enable_block_reuse": False, "enable_partial_reuse": False, "tokens_per_block": 32, @@ -236,7 +236,7 @@ def test_dwdp_accuracy(self): "max_batch_size": 128, "max_num_tokens": 1024, "kv_cache_config": { - "free_gpu_memory_fraction": 0.5, + "free_gpu_memory_fraction": 0.33, "enable_block_reuse": False, "enable_partial_reuse": False, "tokens_per_block": 32, diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index c7aca9535a20..379260b10e60 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1735,7 +1735,7 @@ def test_bfloat16_python_scheduler(self, mtp_nextn, attention_dp, @pytest.mark.skip_less_device_memory(60000) def test_bfloat16_2_model_mtp(self): - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.3) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.2) pytorch_config = dict( disable_overlap_scheduler=True, cuda_graph_config=CudaGraphConfig(), @@ -1796,7 +1796,7 @@ def test_bfloat16_mtp_sa_global_pool(self): @parametrize_with_ids("mtp_nextn", [0, 2]) def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn): """Accuracy test for attention DP with KV cache-aware routing.""" - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.62, enable_block_reuse=True) pytorch_config = dict( disable_overlap_scheduler=False, @@ -1839,7 +1839,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pp_partition[-1] = num_hidden_layers - sum(pp_partition[:-1]) else: pp_partition = None - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.40) torch_compile_config = _get_default_torch_compile_config(torch_compile) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -1869,7 +1869,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, def test_bfloat16_4gpus_python_scheduler(self, tp_size, pp_size, ep_size, mtp_nextn): scheduler_config = SchedulerConfig(use_python_scheduler=True) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.65) pytorch_config = dict(cuda_graph_config=CudaGraphConfig(), ) mtp_config = None if mtp_nextn > 0: @@ -2426,7 +2426,7 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, if moe_backend == "CUTEDSL" and sm_version not in (100, 103): pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only") - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.65) # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp. torch_compile_config = _get_default_torch_compile_config(torch_compile) pytorch_config = dict( @@ -2784,7 +2784,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.50) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, @@ -3078,12 +3078,12 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, if is_sm_100f(): moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5) else: if moe_backend != "_DEFAULT": pytest.skip("Not supported MoE backend!") moe_config = MoeConfig() - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -3198,7 +3198,7 @@ def test_skip_softmax_attention_multi_gpus(self, target_sparsity: float, "prefill": thr_prefill, "decode": thr_decode, }) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.43, enable_block_reuse=False) sm_version = get_sm_version() @@ -3252,12 +3252,12 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, if get_sm_version() == 100 or get_sm_version() == 103: moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.3) else: if moe_backend != "_DEFAULT": pytest.skip("Not supported MoE backend!") moe_config = MoeConfig() - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -3335,7 +3335,7 @@ def test_dsa_host_cache_offload(self, tp_size, pp_size, ep_size, mtp_nextn, # Use a small GPU fraction so that host offload is actually exercised. kv_cache_config = KvCacheConfig( - free_gpu_memory_fraction=0.4, + free_gpu_memory_fraction=0.28, host_cache_size=host_cache_size_gb * (1 << 30), ) @@ -4055,7 +4055,7 @@ class TestKimiK25(LlmapiAccuracyTestHarness): def test_nvfp4(self, ep_size, attention_dp): model_name = "moonshotai/Kimi-K2.5" model_path = f"{llm_models_root()}/Kimi-K2.5-NVFP4" - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5) with LLM(model_path, tensor_parallel_size=8, @@ -4410,12 +4410,15 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5, ) + with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B" if is_cached else "Qwen/Qwen3-8B", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, **pytorch_config, + kv_cache_config=kv_cache_config, enable_attention_dp=attention_dp) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) @@ -5020,7 +5023,7 @@ def test_skip_softmax_attention_4gpus(self, target_sparsity: float, "prefill": thr_prefill, "decode": thr_decode, }) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.45, enable_block_reuse=False, dtype="fp8" if fp8kv else "auto") @@ -5345,7 +5348,7 @@ def test_w4_4gpus(self, v2_kv_cache, kv_cache_reuse, kv_cache_dtype, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, moe_config=MoeConfig(backend=moe_backend)) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.45, dtype=kv_cache_dtype, enable_block_reuse=kv_cache_reuse, use_kv_cache_manager_v2=v2_kv_cache) @@ -5573,7 +5576,7 @@ def test_eagle3_4gpus(self, v2_kv_cache, moe_backend, one_model, # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig()) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.25, dtype="auto", use_kv_cache_manager_v2=v2_kv_cache) @@ -5920,8 +5923,7 @@ class TestQwen3NextInstruct(LlmapiAccuracyTestHarness): def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph, overlap_scheduler, attention_dp, mocker): model_path = f"{self.MODEL_PATH}/Qwen3-Next-80B-A3B-Instruct" - - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5, enable_block_reuse=False) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -6188,7 +6190,7 @@ def test_nvfp4(self, tp_size, ep_size, cuda_graph, overlap_scheduler, if not os.path.exists(model_path): pytest.skip(f"Model directory {model_path} does not exist") - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9, + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7, enable_block_reuse=enable_block_reuse) pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig( @@ -6226,7 +6228,7 @@ class TestSeedOss_36B(LlmapiAccuracyTestHarness): @pytest.mark.timeout(14400) @pytest.mark.skip_less_device_memory(140000) def test_auto_dtype(self): - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5) chat_template_kwargs = dict(thinking_budget=-1) with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm: diff --git a/tests/integration/defs/disaggregated/disagg_test_utils.py b/tests/integration/defs/disaggregated/disagg_test_utils.py index d9691a0b1127..bcd862a5b041 100644 --- a/tests/integration/defs/disaggregated/disagg_test_utils.py +++ b/tests/integration/defs/disaggregated/disagg_test_utils.py @@ -437,6 +437,7 @@ def disagg_server_config(disagg_cluster_config, router, disagg_port): "disagg_cluster": disagg_cluster_config, "context_servers": {"router": {"type": router}}, "generation_servers": {"router": {"type": router}}, + "kv_cache_config": {"free_gpu_memory_fraction": 0.4}, } diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 3ce76105bd0c..07f3406b927f 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -27,6 +27,7 @@ import pytest import yaml +from tensorrt_llm import commands from test_common.error_utils import report_error from test_common.http_utils import wait_for_endpoint_ready @@ -667,7 +668,22 @@ class AggrTestCmds(NamedTuple): def get_server_logs(self, server_idx) -> List[str]: server_file_path = os.path.join(self.test_output_dir, f"trtllm-serve.{server_idx}.log") - return [server_file_path] + server_logs = [server_file_path] + # Include the SLURM-level aggregated server log (written by slurm_launch_draft.sh) + aggr_server_log = os.path.join(self.output_dir, "aggr_server.log") + server_logs.append(aggr_server_log) + return server_logs + + def collect_logs(self) -> None: + """Copy SLURM-level logs from output_dir to test_output_dir for artifact collection.""" + log_files = [os.path.join(self.output_dir, "aggr_server.log")] + log_files.append(os.path.join(self.output_dir, "job-output.log")) + log_files.extend(glob.glob(os.path.join(self.output_dir, "slurm-*.out"))) + for src in log_files: + if os.path.exists(src): + dst = os.path.join(self.test_output_dir, os.path.basename(src)) + shutil.copy2(src, dst) + print_info(f"Collected log: {src} -> {dst}") def run_cmd(self, server_idx: int) -> List[str]: """Run all clients for a server and return outputs.""" @@ -868,6 +884,20 @@ def get_server_logs(self, server_idx: int) -> List[str]: server_logs.append(os.path.join(self.output_dir, "disagg_server.log")) return server_logs + def collect_logs(self) -> None: + """Copy SLURM-level logs from output_dir to test_output_dir for artifact collection.""" + log_files = [] + for i in range(self.num_ctx_servers): + log_files.append(os.path.join(self.output_dir, f"ctx_server_{i}.log")) + for i in range(self.num_gen_servers): + log_files.append(os.path.join(self.output_dir, f"gen_server_{i}.log")) + log_files.append(os.path.join(self.output_dir, "disagg_server.log")) + for src in log_files: + if os.path.exists(src): + dst = os.path.join(self.test_output_dir, os.path.basename(src)) + shutil.copy2(src, dst) + print_info(f"Collected log: {src} -> {dst}") + @staticmethod def _wait_for_config_file(config_path: str, timeout: int = 600) -> None: """Wait for a config file to be written by the primary (_0) worker.""" @@ -909,9 +939,11 @@ def run_cmd(self, server_idx: int) -> List[str]: f"trtllm-serve.{self.disagg_serving_type}.{server_idx}.log", ) with open(server_file_path, "w") as server_ctx: + server_env = copy.deepcopy(os.environ) + server_env["TLLM_WORKER_LOG_FILE"] = server_file_path server_proc = subprocess.Popen( server_cmd, - env=copy.deepcopy(os.environ), + env=server_env, stdout=server_ctx, stderr=subprocess.STDOUT, ) @@ -930,9 +962,11 @@ def run_cmd(self, server_idx: int) -> List[str]: f"trtllm-serve.{self.disagg_serving_type}.{server_idx}.log", ) with open(disagg_server_file_path, "w") as disagg_server_ctx: + disagg_env = copy.deepcopy(os.environ) + disagg_env["TLLM_WORKER_LOG_FILE"] = disagg_server_file_path disagg_server_proc = subprocess.Popen( disagg_cmd, - env=copy.deepcopy(os.environ), + env=disagg_env, stdout=disagg_server_ctx, stderr=subprocess.STDOUT, ) @@ -1565,6 +1599,9 @@ def run_ex(self, commands) -> Dict[int, List[str]]: error_msg=e, log_files=commands.get_server_logs(server_idx), ) + finally: + # Copy SLURM-level logs to test_output_dir for artifact collection + commands.collect_logs() return outputs diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py index 8f502edbbce8..ea5753af3fec 100644 --- a/tests/integration/defs/stress_test/stress_test.py +++ b/tests/integration/defs/stress_test/stress_test.py @@ -90,8 +90,7 @@ class ServerConfig: ep_size: Optional[int] = 1 max_batch_size: Optional[int] = 1024 # 2048 is default value in BuildConfig max_num_tokens: Optional[int] = 8192 # 8192 is default value in BuildConfig - kv_cache_free_gpu_memory_fraction: Optional[ - float] = 0.9 # 0.9 is default value in BuildConfig + kv_cache_free_gpu_memory_fraction: Optional[float] = 0.6 capacity_scheduler_policy: str = "GUARANTEED_NO_EVICT" wait_interval: int = 10 # seconds max_wait_seconds: int = 600 # 10 mins <- Larger model need longer model loading time @@ -582,7 +581,7 @@ def stress_test(config, max_num_tokens= 8192, # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens kv_cache_free_gpu_memory_fraction= - 0.85, # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction + 0.75, # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction capacity_scheduler_policy=test_server_config. capacity_scheduler_policy, wait_interval=test_server_config.wait_interval, diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 3166ca140520..65428f9eafba 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -496,7 +496,8 @@ def test_trtllm_bench_llmapi_launch(llm_root, llm_venv, model_name, streaming=False, use_pytorch_backend=use_pytorch_backend, use_mpirun=True, - tp_size=2) + tp_size=2, + kv_cache_free_gpu_mem_fraction=0.5) runner() @@ -1499,6 +1500,8 @@ def test_ptp_quickstart_advanced_bs1(llm_root, llm_venv): "\"NVIDIA is a great company because\"", "--model_dir", f"{llm_models_root()}/{model_path}", + "--kv_cache_fraction", + f"0.5", ]) diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml index 0e194a0b1997..2fd880a1aced 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml @@ -29,7 +29,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.53 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 @@ -66,7 +66,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.42 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml index df123fbc5df5..3fdbcce8198b 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml @@ -23,7 +23,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.667 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 @@ -59,7 +59,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.62 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 @@ -90,7 +90,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.646 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 @@ -126,7 +126,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.66 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml index 621a7542a60d..c915ac573fbc 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml @@ -29,7 +29,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.75 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 @@ -127,7 +127,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.5837 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 @@ -225,7 +225,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.66 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 @@ -287,7 +287,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.657 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp8_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp8_blackwell.yaml index 7e83196a86d7..9d7dba787abb 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp8_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp8_blackwell.yaml @@ -23,7 +23,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.71 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 @@ -90,7 +90,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.7 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 @@ -157,7 +157,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.712 client_configs: - name: "con64_iter10_6k1k" concurrency: 64 diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml index b45d899d9e41..70c639961c90 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml @@ -23,7 +23,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.612 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 @@ -59,7 +59,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.373 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml index d8b9d1735540..a9ee30afb278 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml @@ -23,7 +23,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.68 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 3 @@ -59,7 +59,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.68 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 @@ -126,7 +126,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.48 speculative_config: decoding_type: 'MTP' num_nextn_predict_layers: 1 diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml index debea1681f7e..478f5588f2dd 100644 --- a/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml @@ -35,7 +35,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.65 tokens_per_block: 64 cache_transceiver_config: backend: UCX diff --git a/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_blackwell.yaml index cde7a71b2a2d..e5ec9eb2bcdb 100644 --- a/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_blackwell.yaml @@ -23,7 +23,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.85 + free_gpu_memory_fraction: 0.808 num_postprocess_workers: 4 stream_interval: 20 client_configs: diff --git a/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_grace_blackwell.yaml index cc5dbe906753..4ea943b9b633 100644 --- a/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/gpt_oss_120b_fp4_grace_blackwell.yaml @@ -113,7 +113,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.769 num_postprocess_workers: 4 stream_interval: 20 client_configs: @@ -142,7 +142,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.774 speculative_config: decoding_type: 'Eagle' eagle3_layers_to_capture: [-1] @@ -177,7 +177,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.85 + free_gpu_memory_fraction: 0.81 num_postprocess_workers: 4 stream_interval: 20 client_configs: diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml index fe1f29489771..a6b05bd97986 100644 --- a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml @@ -24,7 +24,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.509 client_configs: - name: "con2_iter10_8k1k" concurrency: 2 @@ -90,7 +90,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.517 client_configs: - name: "con2_iter10_32k8k" concurrency: 2 @@ -126,7 +126,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.54 client_configs: - name: "con128_iter10_32k8k" concurrency: 128 diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml index 35335416f193..2d61474c29a4 100644 --- a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml @@ -24,7 +24,7 @@ server_configs: kv_cache_config: dtype: 'fp8' enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.394 client_configs: - name: "con2_iter10_8k1k" concurrency: 2 diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 94e8e0e4b4a2..e38cca683ba9 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.349 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index ac5c36f16665..e33f65bb86f6 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 256 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.73 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.57 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index f986eeda6983..11176a4a3266 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 192 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.73 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.49 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 2b39be1b676d..2270115537b7 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.349 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml index 12916108ff24..08d849213cce 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 8 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.709 dtype: fp8 moe_config: backend: CUTEDSL @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.3 + free_gpu_memory_fraction: 0.173 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 0117cf4d0b5a..a99eeb932c7e 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 2 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.536 dtype: fp8 moe_config: backend: CUTEDSL @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.3 + free_gpu_memory_fraction: 0.21 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 5f9722f34ff2..86f77799de03 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -76,7 +76,7 @@ worker_config: - 2048 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.7 + free_gpu_memory_fraction: 0.60 dtype: fp8 moe_config: backend: CUTEDSL @@ -98,7 +98,7 @@ worker_config: disable_overlap_scheduler: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.85 + free_gpu_memory_fraction: 0.72 dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 4608 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index aa31b144efbc..3bf3e7779378 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.558 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml index 20706a01c585..e1c99c884995 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: print_iter_log: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.761 dtype: fp8 moe_config: backend: CUTLASS @@ -84,7 +84,7 @@ worker_config: disable_overlap_scheduler: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.66 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index ce7d3b5be45d..33feec92fc75 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 128 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.828 dtype: fp8 moe_config: backend: TRTLLM @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml index 3370d1faa8cb..db3b8dca4256 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 768 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.662 dtype: fp8 moe_config: backend: CUTEDSL @@ -90,7 +90,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml index 43f0535d8383..f72003bec864 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 768 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.77 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.39 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 8c08fcde53d2..54f1e96503e0 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.698 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 60bec288c4a2..db7273b05e83 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 128 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.828 dtype: fp8 moe_config: backend: TRTLLM @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml index fcfbe0616235..f4b95909015b 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.551 dtype: fp8 tokens_per_block: 64 moe_config: @@ -97,7 +97,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.326 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 2c3b2b19759d..f9ea0d218d52 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.825 dtype: fp8 tokens_per_block: 64 moe_config: @@ -93,7 +93,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.326 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml index bf9ba21f0835..b115a9e79a2e 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 512 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.68 dtype: fp8 tokens_per_block: 64 moe_config: @@ -94,7 +94,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.35 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 2a91f3633c0d..63629e0767bb 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -62,7 +62,7 @@ worker_config: max_batch_size: 1 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.25 + free_gpu_memory_fraction: 0.236 dtype: fp8 tokens_per_block: 64 moe_config: @@ -98,7 +98,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.4 + free_gpu_memory_fraction: 0.10 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 99002c9f2558..06d83e67471c 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -62,7 +62,7 @@ worker_config: max_batch_size: 8 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.5 + free_gpu_memory_fraction: 0.42 dtype: fp8 tokens_per_block: 64 moe_config: @@ -98,7 +98,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.4 + free_gpu_memory_fraction: 0.08 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml index 9158df38b979..82fc263960b3 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 8 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.698 dtype: fp8 tokens_per_block: 64 moe_config: @@ -91,7 +91,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: true - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.70 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml index e40afbe44267..c723f4bdea22 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.688 dtype: fp8 tokens_per_block: 64 moe_config: @@ -97,7 +97,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.51 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 0a7b3a052e37..13dd85d7c3a6 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.80 dtype: fp8 tokens_per_block: 64 moe_config: @@ -93,7 +93,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.40 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml index 3a3747731f78..78c78688b3a0 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 1536 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.836 dtype: fp8 moe_config: backend: TRTLLM @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.85 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index ca773d4b268d..6a0b9c0e5bdc 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.85 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index 0765412f2e62..1831e13b4922 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1024 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.86 dtype: fp8 moe_config: backend: TRTLLM @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.846 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index bb027c657d2c..a0af7509465c 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1024 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.86 dtype: fp8 moe_config: backend: TRTLLM @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.846 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml index 3a1d7fdbeffd..a83528237cbf 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 512 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.848 dtype: fp8 moe_config: backend: TRTLLM @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.846 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 8eb7293b9a87..d6b841ce59f1 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 64 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.694 dtype: fp8 moe_config: backend: CUTEDSL @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.77 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index 3ee92df5b4a2..52e24b56c544 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 4 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.599 dtype: fp8 moe_config: backend: TRTLLM @@ -83,7 +83,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.408 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index cef09d7dd066..09cae2151133 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -89,7 +89,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.388 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index 6a74a4fcfc36..69ef63b95658 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 64 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.87 dtype: fp8 moe_config: backend: TRTLLM @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.52 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml index 28f8296df512..c2790abd37dd 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 8 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.709 dtype: fp8 moe_config: backend: CUTEDSL @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.3 + free_gpu_memory_fraction: 0.21 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 0117cf4d0b5a..a99eeb932c7e 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 2 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.536 dtype: fp8 moe_config: backend: CUTEDSL @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.3 + free_gpu_memory_fraction: 0.21 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 5f9722f34ff2..86f77799de03 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -76,7 +76,7 @@ worker_config: - 2048 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.7 + free_gpu_memory_fraction: 0.60 dtype: fp8 moe_config: backend: CUTEDSL @@ -98,7 +98,7 @@ worker_config: disable_overlap_scheduler: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.85 + free_gpu_memory_fraction: 0.72 dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 4608 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index aa31b144efbc..3bf3e7779378 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.558 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml index 20706a01c585..e1c99c884995 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: print_iter_log: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.761 dtype: fp8 moe_config: backend: CUTLASS @@ -84,7 +84,7 @@ worker_config: disable_overlap_scheduler: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.66 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index ce7d3b5be45d..33feec92fc75 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 128 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.828 dtype: fp8 moe_config: backend: TRTLLM @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml index 73dae2e6d765..0e1b73315533 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 768 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.667 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml index 3370d1faa8cb..db3b8dca4256 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 768 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.662 dtype: fp8 moe_config: backend: CUTEDSL @@ -90,7 +90,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml index 43f0535d8383..78e3ef77498b 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 768 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.50 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.31 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml index c6133efa8011..2f9cdb58691b 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 768 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.77 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.40 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 8c08fcde53d2..54f1e96503e0 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.698 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 60bec288c4a2..db7273b05e83 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 128 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.828 dtype: fp8 moe_config: backend: TRTLLM @@ -86,7 +86,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.361 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml index 1ed2d2cff294..aa6df6cb96f1 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 256 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.721 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.362 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml index fcfbe0616235..f4b95909015b 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.551 dtype: fp8 tokens_per_block: 64 moe_config: @@ -97,7 +97,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.326 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 2c3b2b19759d..f9ea0d218d52 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.825 dtype: fp8 tokens_per_block: 64 moe_config: @@ -93,7 +93,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.326 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml index bf9ba21f0835..05d104b6f7f4 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 512 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.52 dtype: fp8 tokens_per_block: 64 moe_config: @@ -94,7 +94,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.300 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml index 86b2ca6ea9ba..9474fb85a080 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 512 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.70 dtype: fp8 tokens_per_block: 64 moe_config: @@ -94,7 +94,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.365 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 2a91f3633c0d..797cfa3f9bcb 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -62,7 +62,7 @@ worker_config: max_batch_size: 1 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.25 + free_gpu_memory_fraction: 0.236 dtype: fp8 tokens_per_block: 64 moe_config: @@ -98,7 +98,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.4 + free_gpu_memory_fraction: 0.097 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml index a2b8f7b1a8f4..26bd6a85c93a 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml @@ -62,7 +62,7 @@ worker_config: max_batch_size: 64 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.5 + free_gpu_memory_fraction: 0.47 dtype: fp8 tokens_per_block: 64 moe_config: @@ -101,7 +101,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.4 + free_gpu_memory_fraction: 0.36 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 99002c9f2558..d0410e55b617 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -62,7 +62,7 @@ worker_config: max_batch_size: 8 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.5 + free_gpu_memory_fraction: 0.48 dtype: fp8 tokens_per_block: 64 moe_config: @@ -98,7 +98,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.4 + free_gpu_memory_fraction: 0.08 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml index 9158df38b979..82fc263960b3 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 8 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.698 dtype: fp8 tokens_per_block: 64 moe_config: @@ -91,7 +91,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: true - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.70 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml index e40afbe44267..c723f4bdea22 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 32 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.688 dtype: fp8 tokens_per_block: 64 moe_config: @@ -97,7 +97,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.51 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 0a7b3a052e37..1cdf912230f7 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.80 dtype: fp8 tokens_per_block: 64 moe_config: @@ -93,7 +93,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.38 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml index 2c4a93be46a0..8bb304f3c01e 100644 --- a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 128 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.67 dtype: fp8 tokens_per_block: 64 moe_config: @@ -94,7 +94,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.464 dtype: fp8 tokens_per_block: 64 moe_config: diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml index c739b0f68751..2df89d75934a 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 1536 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.836 dtype: fp8 moe_config: backend: TRTLLM @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.85 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml index 3a3747731f78..78c78688b3a0 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 1536 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.836 dtype: fp8 moe_config: backend: TRTLLM @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.85 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index ca773d4b268d..6a0b9c0e5bdc 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.85 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml index ec990076ef08..0f77fbd0965c 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml @@ -68,7 +68,7 @@ worker_config: max_batch_size: 1280 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.85 + free_gpu_memory_fraction: 0.815 dtype: fp8 moe_config: backend: TRTLLM @@ -94,7 +94,7 @@ worker_config: max_batch_size: 30 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.779 dtype: fp8 moe_config: backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index 0765412f2e62..1831e13b4922 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1024 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.86 dtype: fp8 moe_config: backend: TRTLLM @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.846 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index bb027c657d2c..a0af7509465c 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 1024 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.86 dtype: fp8 moe_config: backend: TRTLLM @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.846 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml index 3a1d7fdbeffd..a83528237cbf 100644 --- a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -63,7 +63,7 @@ worker_config: max_batch_size: 512 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.848 dtype: fp8 moe_config: backend: TRTLLM @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.846 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 8eb7293b9a87..6dcbe19a38fd 100644 --- a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 64 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.694 dtype: fp8 moe_config: backend: CUTEDSL @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.5 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml index 1ea511a1e7a3..eaff1b8b36ba 100644 --- a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 512 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.708 dtype: fp8 moe_config: backend: CUTEDSL @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.408 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index 3ee92df5b4a2..52e24b56c544 100644 --- a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 4 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.599 dtype: fp8 moe_config: backend: TRTLLM @@ -83,7 +83,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.408 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml index d84d38d742b7..80af732fac5e 100644 --- a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 256 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.735 dtype: fp8 moe_config: backend: CUTEDSL @@ -85,7 +85,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.405 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index cef09d7dd066..09cae2151133 100644 --- a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -89,7 +89,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.388 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml index a97acb893057..8b2da2bee0ca 100644 --- a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml @@ -60,7 +60,7 @@ worker_config: max_batch_size: 128 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.841 dtype: fp8 moe_config: backend: CUTEDSL @@ -83,7 +83,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.486 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index 6a74a4fcfc36..69ef63b95658 100644 --- a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -59,7 +59,7 @@ worker_config: max_batch_size: 64 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.87 dtype: fp8 moe_config: backend: TRTLLM @@ -81,7 +81,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.52 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index 686fb268738a..3e80a62e5124 100644 --- a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -61,7 +61,7 @@ worker_config: max_batch_size: 16 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.8 + free_gpu_memory_fraction: 0.75 dtype: fp8 moe_config: backend: CUTEDSL @@ -87,7 +87,7 @@ worker_config: cuda_graph_config: null kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.3 + free_gpu_memory_fraction: 0.27 dtype: fp8 moe_config: backend: CUTEDSL diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py index e753ee63679f..96e26438ad41 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @@ -54,7 +54,9 @@ def create_nemotron_h_llm(model_folder, cuda_graph_config=CudaGraphConfig() if use_cuda_graph else None, disable_overlap_scheduler=disable_overlap_scheduler, kv_cache_config=KvCacheConfig( - mamba_ssm_cache_dtype=mamba_ssm_cache_dtype) + enable_block_reuse=False, + mamba_ssm_cache_dtype=mamba_ssm_cache_dtype, + free_gpu_memory_fraction=0.5) if mamba_ssm_cache_dtype is not None else KvCacheConfig(), enable_chunked_prefill=enable_chunked_prefill, **kwargs, diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_nano_v2_vl.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_nano_v2_vl.py index 8b2060a25705..6d437612d369 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_nano_v2_vl.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_nano_v2_vl.py @@ -73,7 +73,9 @@ def nano_llm_model(): tensor_parallel_size=1, max_batch_size=2, cuda_graph_config=CudaGraphConfig(), - kv_cache_config=KvCacheConfig(enable_block_reuse=False, mamba_ssm_cache_dtype="float32"), + kv_cache_config=KvCacheConfig( + enable_block_reuse=False, mamba_ssm_cache_dtype="float32", free_gpu_memory_fraction=0.4 + ), ) yield nano_llm diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index f06da5cc4e21..073cc6674b41 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -368,11 +368,13 @@ def test_llama_eagle3_long_prompt(use_cuda_graph): else: cuda_graph_config = None + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) llm_spec = LLM(model=target_model_dir, speculative_config=spec_config, max_batch_size=1, cuda_graph_config=cuda_graph_config, - disable_overlap_scheduler=True) + disable_overlap_scheduler=True, + kv_cache_config=kv_cache_config) prompt = [", ".join(str(i) for i in range(1000))] diff --git a/tests/unittest/llmapi/apps/_test_openai_lora.py b/tests/unittest/llmapi/apps/_test_openai_lora.py index 8e624122428c..457457862c5a 100644 --- a/tests/unittest/llmapi/apps/_test_openai_lora.py +++ b/tests/unittest/llmapi/apps/_test_openai_lora.py @@ -42,7 +42,10 @@ def temp_extra_llm_api_options_file(): }, # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA - "cuda_graph_config": None + "cuda_graph_config": None, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5 + }, } with open(temp_file_path, 'w') as f: diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_lora.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_lora.py index e94c30662b1a..37c9172ae9cd 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_lora.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_lora.py @@ -28,7 +28,10 @@ def temp_extra_llm_api_options_file(): "max_lora_rank": 8, "max_loras": 4, "max_cpu_loras": 4, - } + }, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5 + }, } with open(temp_file_path, 'w') as f: