40 changes: 22 additions & 18 deletions jenkins/L0_Test.groovy
@@ -281,25 +281,29 @@ def uploadResults(def pipeline, SlurmCluster cluster, String clusterName, String
downloadPerfResultSucceed = Utils.exec(pipeline, script: scpFromRemoteCmd(remote, scpSources, "${stageName}/"), returnStatus: true, numRetries: 3) == 0
}

// Download all slurm log files from the working directory
def slurmLogListOutput = Utils.exec(
pipeline,
script: Utils.sshUserCmd(
remote,
"\"find '${perfResultsBasePath}' -maxdepth 1 \\( -name '*.log' -o -name '*.out' -o -name 'slurm-*' \\) -printf '%f\\n' || true\""
),
returnStdout: true,
numRetries: 3
)?.trim() ?: ""
def slurmLogFiles = slurmLogListOutput.split(/\s+/).collect { it.trim() }.findAll { it }
echo "Slurm Log Files: ${slurmLogFiles}"
if (slurmLogFiles) {
sh "mkdir -p ${stageName}/slurm-logs"
def slurmLogSources = slurmLogFiles.size() == 1
? "${perfResultsBasePath}/${slurmLogFiles[0]}"
: "{${slurmLogFiles.collect { "${perfResultsBasePath}/${it}" }.join(',')}}"
Utils.exec(pipeline, script: scpFromRemoteCmd(remote, slurmLogSources, "${stageName}/slurm-logs/"), returnStatus: true, numRetries: 3)
}


echo "hasTimeoutTest: ${hasTimeoutTest}, downloadResultSucceed: ${downloadResultSucceed}, downloadPerfResultSucceed: ${downloadPerfResultSucceed}"
if (hasTimeoutTest || downloadResultSucceed || downloadPerfResultSucceed) {
// On retry attempts, rename freshly-downloaded result XMLs so that
// (a) the tar for this attempt is distinguishable from prior attempts
// already uploaded to Artifactory, and
// (b) the junit() glob below picks up this attempt's results as a
// separate set, keeping earlier attempts' test data visible in
// the Jenkins build report rather than overwriting it.
if (postTag) {
sh """
cd ${stageName}
for f in results*.xml; do
[ -f "\$f" ] || continue
case "\$f" in *${postTag}.xml) continue ;; esac
name=\"\${f%.xml}\"
mv \"\$f\" \"\${name}${postTag}.xml\" || true
done
"""
}
if (hasTimeoutTest || downloadResultSucceed || downloadPerfResultSucceed || slurmLogFiles) {
sh "ls -al ${stageName}/"
echo "Upload test results."
sh "tar -czvf results-${stageName}${postTag}.tar.gz ${stageName}/"
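The single-file versus multi-file branch above exists because scp brace expansion (`{a,b,c}`) is only valid with two or more entries. A minimal Python sketch of the same source-string construction, for illustration only (the function name and example paths are hypothetical):

```python
def build_scp_sources(base_path: str, files: list[str]) -> str:
    """Build the remote source argument for a single scp invocation.

    Mirrors the Groovy logic above: one file is passed verbatim, while
    several files are folded into one brace-expansion pattern so that a
    single scp call fetches all of them in one round trip.
    """
    if len(files) == 1:
        return f"{base_path}/{files[0]}"
    return "{" + ",".join(f"{base_path}/{name}" for name in files) + "}"


# Example: build_scp_sources("/workdir", ["slurm-42.out", "pytest.log"])
# returns "{/workdir/slurm-42.out,/workdir/pytest.log}"
```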
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/_util.py
@@ -408,7 +408,8 @@ def configure_kv_cache_capacity(self,
mapping = self._mapping

# TODO: support CP by generating dummy requests for it.
assert 'cp_type' not in mapping.cp_config
if not self._skip_est:
assert 'cp_type' not in mapping.cp_config

fraction = self._kv_cache_config.free_gpu_memory_fraction

2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -300,7 +300,7 @@ def create_py_executor(
A fully initialized PyExecutor instance.
"""

skip_est = os.environ.get("TRTLLM_SKIP_KV_CACHE_ESTIMATION", '0') == '1'
skip_est = os.environ.get("TRTLLM_SKIP_KV_CACHE_ESTIMATION", '1') == '1'
llm_args, checkpoint_loader = _load_config_and_create_checkpoint_loader(
llm_args, checkpoint_dir)

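With the default flipped above, KV-cache estimation is now skipped unless `TRTLLM_SKIP_KV_CACHE_ESTIMATION=0` is set, which is why the `cp_type` assertion in `_util.py` is now gated on the same flag. A minimal sketch of the flow, assuming the flag is threaded into the estimator as `_skip_est` (the wiring in between, and the class name, are not shown in this diff):

```python
import os

# New default: estimation is skipped unless explicitly re-enabled.
skip_est = os.environ.get("TRTLLM_SKIP_KV_CACHE_ESTIMATION", '1') == '1'


class KvCacheCapacityEstimator:  # hypothetical name, for illustration
    def __init__(self, mapping, skip_est: bool):
        self._mapping = mapping
        self._skip_est = skip_est

    def configure_kv_cache_capacity(self):
        # Dummy-request estimation does not support context parallelism
        # (CP), so the assertion only matters when estimation actually runs.
        if not self._skip_est:
            assert 'cp_type' not in self._mapping.cp_config
        ...
```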
4 changes: 2 additions & 2 deletions tensorrt_llm/llmapi/llm_args.py
@@ -2428,11 +2428,11 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
description=
"Number of sink tokens (tokens to always keep in attention window).")
free_gpu_memory_fraction: Optional[float] = Field(
default=0.9,
default=0.7,
ge=0,
le=1,
description=
"The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used."
"The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 70%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used."
)
host_cache_size: Optional[int] = Field(
default=None,
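Callers that relied on the old 90% default must now opt in explicitly. A short usage sketch against the LLM API (the values are illustrative, not a recommendation):

```python
from tensorrt_llm.llmapi import KvCacheConfig

# Opt back into a larger KV-cache budget than the new 70% default.
# When max_tokens is also set, the smaller of the two budgets applies.
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.9,
    enable_block_reuse=True,
)
```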
86 changes: 68 additions & 18 deletions tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -539,7 +539,7 @@ def run_parallel_test(model_name: str,
)

kv_cache_config = {
"free_gpu_memory_fraction": 0.5,
"free_gpu_memory_fraction": 0.35,
"enable_block_reuse": True
}
ctx_server_config = {
@@ -600,7 +600,8 @@ def test_auto_dtype(self, ctx_disable_overlap_scheduler,
ctx_server_config = {
"disable_overlap_scheduler": ctx_disable_overlap_scheduler,
"kv_cache_config": {
"enable_block_reuse": ctx_enable_block_reuse
"enable_block_reuse": ctx_enable_block_reuse,
"free_gpu_memory_fraction": 0.4
}
}
ctx_server_config["cache_transceiver_config"] = {
@@ -610,7 +611,8 @@ def test_ngram(self):
gen_server_config = {
"disable_overlap_scheduler": gen_disable_overlap_scheduler,
"kv_cache_config": {
"enable_block_reuse": gen_enable_block_reuse
"enable_block_reuse": gen_enable_block_reuse,
"free_gpu_memory_fraction": 0.3
}
}
gen_server_config["cache_transceiver_config"] = {
@@ -687,7 +689,7 @@ def test_ngram(self):
"is_public_pool": True
}
kv_cache_config = {
"free_gpu_memory_fraction": 0.5,
"free_gpu_memory_fraction": 0.35,
"enable_block_reuse": False
}
ctx_server_config = {
@@ -739,7 +741,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
True, # BS=1 does not need overlap scheduling
"speculative_config": speculative_decoding_config,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.5,
"free_gpu_memory_fraction": 0.3,
"enable_block_reuse": True # reuse on context requests
},
"max_num_tokens": 13393 * 2,
@@ -754,7 +756,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
"disable_overlap_scheduler": not overlap_scheduler,
"speculative_config": speculative_decoding_config,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.5,
"free_gpu_memory_fraction": 0.3,
"enable_block_reuse": False
},
"max_num_tokens": 13393 * 2,
@@ -791,14 +793,20 @@ def test_guided_decoding(self, backend: str, mocker):
"cache_transceiver_config": {
"backend": "DEFAULT",
"max_tokens_in_buffer": 4096
}
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.3
},
}
gen_server_config = {
"guided_decoding_backend": backend,
"cache_transceiver_config": {
"backend": "DEFAULT",
"max_tokens_in_buffer": 4096
}
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.3
},
}
disaggregated_server_config = {
"hostname": "localhost",
@@ -834,7 +842,7 @@ def test_guided_decoding_with_eagle3(self, backend: str,
"disable_overlap_scheduler": True,
"speculative_config": speculative_decoding_config,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.8,
"free_gpu_memory_fraction": 0.3,
},
"guided_decoding_backend": backend,
"cache_transceiver_config": {
@@ -847,7 +855,7 @@ def test_guided_decoding_with_eagle3(self, backend: str,
"disable_overlap_scheduler": not eagle3_one_model,
"speculative_config": speculative_decoding_config,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.8,
"free_gpu_memory_fraction": 0.3,
},
"guided_decoding_backend": backend,
"cache_transceiver_config": {
@@ -938,6 +946,14 @@ def test_auto_dtype(self, overlap_scheduler):
# Keep this low to avoid warmup OOM in CI
ctx_server_config["max_seq_len"] = 8192
gen_server_config["max_seq_len"] = 8192

ctx_server_config["kv_cache_config"] = {
"free_gpu_memory_fraction": 0.6,
}
gen_server_config["kv_cache_config"] = {
"free_gpu_memory_fraction": 0.6,
}

disaggregated_server_config = {
"hostname": "localhost",
"backend": "pytorch",
@@ -971,13 +987,19 @@ def test_nixl_backend(self):
"cache_transceiver_config": {
"backend": "NIXL",
"max_tokens_in_buffer": 4096
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.6
}
}
gen_server_config = {
"disable_overlap_scheduler": True,
"cache_transceiver_config": {
"backend": "NIXL",
"max_tokens_in_buffer": 4096
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.6
}
}
disaggregated_server_config = {
@@ -1012,6 +1034,9 @@ def test_gen_only_sync(self):
"transceiver_runtime": "PYTHON",
"max_tokens_in_buffer": 4096,
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.5
}
}
gen_server_config = {
"disable_overlap_scheduler": True,
@@ -1020,6 +1045,9 @@
"transceiver_runtime": "PYTHON",
"max_tokens_in_buffer": 4096,
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.5
}
}
disaggregated_server_config = {
"hostname": "localhost",
@@ -1056,6 +1084,14 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn):
"backend": "DEFAULT",
"max_tokens_in_buffer": 4096
}

ctx_server_config["kv_cache_config"] = {
"free_gpu_memory_fraction": 0.6,
}
gen_server_config["kv_cache_config"] = {
"free_gpu_memory_fraction": 0.6,
}

if mtp_nextn > 0:
ctx_server_config["speculative_config"] = {
"decoding_type": "MTP",
@@ -1065,6 +1101,7 @@
"decoding_type": "MTP",
"num_nextn_predict_layers": mtp_nextn
}

disaggregated_server_config = {
"hostname": "localhost",
"backend": "pytorch",
@@ -1187,7 +1224,7 @@ def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker):
ctx_server_config = {
"disable_overlap_scheduler": True,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.8,
"free_gpu_memory_fraction": 0.55,
},
"guided_decoding_backend": backend,
"cache_transceiver_config": {
@@ -1198,7 +1235,7 @@
gen_server_config = {
"disable_overlap_scheduler": False,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.8,
"free_gpu_memory_fraction": 0.55,
},
"guided_decoding_backend": backend,
"cache_transceiver_config": {
@@ -1239,18 +1276,20 @@ def test_kv_cache_v2_nixl_python(self):
"disable_overlap_scheduler": True,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
"use_kv_cache_manager_v2": True,
"free_gpu_memory_fraction": 0.55
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
"transceiver_runtime": "PYTHON",
}
}
gen_server_config = {
"disable_overlap_scheduler": True,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
"use_kv_cache_manager_v2": True,
"free_gpu_memory_fraction": 0.55
},
"cache_transceiver_config": {
"backend": "NIXL",
@@ -1306,11 +1345,13 @@ def test_auto_dtype(self, block_reuse):
"max_attention_window": [512, 512, 512, 512, 512, 32768],
"enable_block_reuse": block_reuse,
"enable_partial_reuse": block_reuse,
"free_gpu_memory_fraction": 0.65,
}
gen_server_config["kv_cache_config"] = {
"max_attention_window": [512, 512, 512, 512, 512, 32768],
"enable_block_reuse": block_reuse,
"enable_partial_reuse": block_reuse,
"free_gpu_memory_fraction": 0.65,
}
disaggregated_server_config = {
"hostname": "localhost",
@@ -1619,13 +1660,19 @@ def test_nixl_backend(self):
"cache_transceiver_config": {
"backend": "NIXL",
"max_tokens_in_buffer": 4096
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.4
}
}
gen_server_config = {
"disable_overlap_scheduler": True,
"cache_transceiver_config": {
"backend": "NIXL",
"max_tokens_in_buffer": 4096
},
"kv_cache_config": {
"free_gpu_memory_fraction": 0.4
}
}
disaggregated_server_config = {
@@ -1650,6 +1697,7 @@ def test_auto_dtype(self, overlap_scheduler, enable_partial_reuse):
kv_cache_config = {
"enable_block_reuse": True,
"enable_partial_reuse": enable_partial_reuse,
"free_gpu_memory_fraction": 0.5,
}
ctx_server_config = {
"disable_overlap_scheduler": True,
@@ -1690,6 +1738,7 @@ def _test_chunked_prefill_helper(self, *, ctx_pp: int):

kv_cache_config = {
"enable_block_reuse": True,
"free_gpu_memory_fraction": 0.5
}

ctx_server_config = {
@@ -1712,6 +1761,7 @@ def _test_chunked_prefill_helper(self, *, ctx_pp: int):
"max_tokens_in_buffer": 4096
},
"max_batch_size": max_batch_size,
"kv_cache_config": kv_cache_config,
}
disaggregated_server_config = {
"hostname": "localhost",
@@ -1774,7 +1824,7 @@ def test_auto_dtype_with_helix(self, comms_medium, cuda_graph_config,
raise ValueError(f"Unknown comms_medium: {comms_medium}")
gen_ep = gen_tp * gen_cp
kv_cache_config = {
"free_gpu_memory_fraction": 0.5,
"free_gpu_memory_fraction": 0.35,
"enable_block_reuse": False,
"enable_partial_reuse": False,
"tokens_per_block": 32,
@@ -1944,7 +1994,7 @@ def test_nvfp4(self):
"enable_attention_dp": True,
"trust_remote_code": True,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.8,
"free_gpu_memory_fraction": 0.65,
},
}
gen_server_config = {
@@ -1958,7 +2008,7 @@
"enable_attention_dp": True,
"trust_remote_code": True,
"kv_cache_config": {
"free_gpu_memory_fraction": 0.8,
"free_gpu_memory_fraction": 0.65,
},
}
disaggregated_server_config = {
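The test changes all follow one pattern: every context and generation server that shares a GPU now pins `free_gpu_memory_fraction` low enough that the two fractions sum comfortably below 1.0, instead of inheriting the (previously 0.9) default. A condensed sketch of the recurring shape, with illustrative values:

```python
# Context and generation servers co-located on one GPU: 0.35 + 0.35 < 1.0
# leaves headroom for weights, activations, and transceiver buffers.
kv_cache_config = {
    "free_gpu_memory_fraction": 0.35,
    "enable_block_reuse": True,
}
ctx_server_config = {
    "disable_overlap_scheduler": True,
    "kv_cache_config": kv_cache_config,
    "cache_transceiver_config": {"backend": "DEFAULT"},
}
gen_server_config = {
    "disable_overlap_scheduler": False,
    "kv_cache_config": kv_cache_config,
    "cache_transceiver_config": {"backend": "DEFAULT"},
}
disaggregated_server_config = {"hostname": "localhost", "backend": "pytorch"}
```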