Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion core/runtime/BUILD
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
load("@rules_cc//cc:defs.bzl", "cc_library")
load("@rules_pkg//:pkg.bzl", "pkg_tar")
load("@rules_pkg//pkg:mappings.bzl", "pkg_files")

package(default_visibility = ["//visibility:public"])

config_setting(
Expand Down Expand Up @@ -66,6 +67,7 @@ cc_library(
"RTDevice.cpp",
"TRTEngine.cpp",
"TRTEngineProfiler.cpp",
"TRTRuntimeConfig.cpp",
"execute_engine.cpp",
"register_jit_hooks.cpp",
"runtime.cpp",
Expand All @@ -75,11 +77,19 @@ cc_library(
"RTDevice.h",
"TRTEngine.h",
"TRTEngineProfiler.h",
"TRTRuntimeConfig.h",
"runtime.h",
],
linkopts = [
"-lstdc++fs",
],
local_defines = select({
Comment thread
tp5uiuc marked this conversation as resolved.
# TensorRT-RTX builds: opt into feature-gated APIs that the runtime layer
# depends on (e.g. IExecutionContext::isStreamCapturable).
":rtx_win": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"],
":rtx_x86_64": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"],
"//conditions:default": [],
}),
deps = [
"//core/plugins:torch_tensorrt_plugins",
"//core/util:prelude",
Expand Down Expand Up @@ -107,6 +117,7 @@ filegroup(
"RTDevice.h",
"TRTEngine.h",
"TRTEngineProfiler.h",
"TRTRuntimeConfig.h",
"runtime.h",
],
visibility = ["//visibility:public"],
Expand All @@ -121,6 +132,6 @@ pkg_tar(
pkg_files(
name = "include_pkg_files",
srcs = [":include_files"],
visibility = ["//visibility:public"],
prefix = "include/torch_tensorrt/core/runtime/",
visibility = ["//visibility:public"],
)
111 changes: 76 additions & 35 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <algorithm>
#include <filesystem>

#include <cuda_runtime.h>
#include "NvInfer.h"
Expand Down Expand Up @@ -54,26 +55,28 @@ void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims
}

// Convenience overload used when deserializing an engine without an explicit module
// name: delegates to the primary constructor with the fixed name "deserialized_trt".
// Large string arguments (serialized engine blob, metadata) are taken by value and
// moved through to avoid an extra copy of potentially multi-megabyte payloads.
// NOTE(review): this span was a scraped diff with old and new parameter lines merged;
// reconstructed here as the post-change signature (by-value strings + TRTRuntimeConfig).
TRTEngine::TRTEngine(
    std::string serialized_engine,
    const RTDevice& cuda_device,
    const std::vector<std::string>& _in_binding_names,
    const std::vector<std::string>& _out_binding_names,
    const Platform& target_platform,
    bool hardware_compatible,
    bool requires_output_allocator,
    std::string serialized_metadata,
    const ResourceAllocationStrategy resource_allocation_strategy,
    TRTRuntimeConfig runtime_cfg)
    : TRTEngine(
          "deserialized_trt",
          std::move(serialized_engine),
          cuda_device,
          _in_binding_names,
          _out_binding_names,
          target_platform,
          hardware_compatible,
          requires_output_allocator,
          std::move(serialized_metadata),
          resource_allocation_strategy,
          std::move(runtime_cfg)) {}

TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
: TRTEngine(
Expand All @@ -88,19 +91,22 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
serialized_info[SERIALIZED_METADATA_IDX],
(static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]))
? ResourceAllocationStrategy::kDynamic
: ResourceAllocationStrategy::kStatic)) {}
: ResourceAllocationStrategy::kStatic),
make_runtime_config_from_serialized(serialized_info)) {}

TRTEngine::TRTEngine(
const std::string& mod_name,
const std::string& serialized_engine,
std::string mod_name,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& _in_binding_names,
const std::vector<std::string>& _out_binding_names,
const Platform& target_platform,
bool hardware_compatible,
bool requires_output_allocator,
const std::string& serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy) {
std::string serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy,
TRTRuntimeConfig runtime_cfg) {
this->runtime_cfg = std::move(runtime_cfg);
TORCHTRT_CHECK(
is_supported_on_current_platform(target_platform),
"This engine was not built to run on this platform (built for: " << target_platform << ", current platform: "
Expand All @@ -111,15 +117,15 @@ TRTEngine::TRTEngine(
auto most_compatible_device = get_most_compatible_device(cuda_device, RTDevice(), hardware_compatible);
TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");

this->serialized_metadata = serialized_metadata;
this->serialized_metadata = std::move(serialized_metadata);
this->requires_output_allocator = requires_output_allocator;
device_info = most_compatible_device.value();
multi_gpu_device_check();
set_rt_device(device_info);

rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));

name = slugify(mod_name);
name = slugify(std::move(mod_name));

cuda_engine = make_trt(rt->deserializeCudaEngine(serialized_engine.c_str(), serialized_engine.size()));
TORCHTRT_CHECK((cuda_engine.get() != nullptr), "Unable to deserialize the TensorRT engine");
Expand All @@ -134,13 +140,7 @@ TRTEngine::TRTEngine(
LOG_DEBUG(
"Resource allocation strategy: "
<< (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static"));
if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
recreate_execution_context();

// Pre-allocate placeholder for empty tensors (TensorRT requires non-null addresses)
cudaMalloc(&empty_tensor_placeholder, 1);
Expand Down Expand Up @@ -265,6 +265,9 @@ TRTEngine::TRTEngine(

TRTEngine::~TRTEngine() {
torch::cuda::synchronize(device_info.id);
// Marked noexcept by the type system, so safe to invoke from a destructor without
// explicit try/catch; any I/O error is logged internally.
runtime_cfg.save_runtime_cache();
trt_engine_profiler.reset();
exec_ctx.reset();
cuda_engine.reset();
Expand All @@ -278,8 +281,7 @@ void TRTEngine::disable_profiling() {
torch::cuda::synchronize(device_info.id);
profile_execution = false;
trt_engine_profiler.reset();
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context");
recreate_execution_context();
Comment thread
tp5uiuc marked this conversation as resolved.
}

void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) {
Expand Down Expand Up @@ -376,10 +378,7 @@ bool TRTEngine::set_device_memory_budget(int64_t budget) {
trt_engine_profiler.reset();
}
bool result = cuda_engine->setWeightStreamingBudgetV2(budget);
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK(
(exec_ctx.get() != nullptr),
"Unable to recreate TensorRT execution context after setting new device memory budget");
recreate_execution_context();
Comment thread
tp5uiuc marked this conversation as resolved.
if (profile_execution) {
enable_profiling();
}
Expand Down Expand Up @@ -428,6 +427,7 @@ std::string TRTEngine::to_str() const {
ss << " Hardware Compatibility: " << (hardware_compatible ? "Enabled" : "Disabled") << std::endl;
ss << " Target Platform: " << target_platform << std::endl;
ss << " Resource Allocation Strategy: " << (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static") << std::endl;
ss << runtime_cfg.to_str();
// clang-format on
return ss.str();
}
Expand Down Expand Up @@ -472,7 +472,14 @@ FlattenedState TRTEngine::__obj_flatten__() {
std::tuple("serialized_metadata", serialized_info[SERIALIZED_METADATA_IDX]),
std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]),
std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]),
std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]));
std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])
#ifdef TRT_MAJOR_RTX
,
std::tuple("runtime_cache_path", serialized_info[RUNTIME_CACHE_PATH_IDX]),
std::tuple("dynamic_shapes_kernel_strategy", serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX]),
std::tuple("cuda_graph_strategy", serialized_info[CUDA_GRAPH_STRATEGY_IDX])
#endif
);
}

std::vector<std::string> TRTEngine::serialize() {
Expand All @@ -497,6 +504,13 @@ std::vector<std::string> TRTEngine::serialize() {
serialized_info[TARGET_PLATFORM_IDX] = this->target_platform.serialize();
serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] =
this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "1" : "0";
#ifdef TRT_MAJOR_RTX
serialized_info[RUNTIME_CACHE_PATH_IDX] = runtime_cfg.runtime_cache_path;
serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX] = std::to_string(
static_cast<std::underlying_type_t<DynamicShapesKernelStrategy>>(runtime_cfg.dynamic_shapes_kernel_strategy));
serialized_info[CUDA_GRAPH_STRATEGY_IDX] =
std::to_string(static_cast<std::underlying_type_t<CudaGraphStrategyOption>>(runtime_cfg.cuda_graph_strategy));
#endif

return serialized_info;
}
Expand All @@ -508,17 +522,44 @@ void TRTEngine::reset_captured_graph() {
void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
if (new_strategy != this->resource_allocation_strategy) {
this->resource_allocation_strategy = new_strategy;
if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
LOG_DEBUG("Setting resource allocation strategy to dynamic");
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
LOG_DEBUG("Setting resource allocation strategy to static");
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
LOG_DEBUG(
"Setting resource allocation strategy to "
<< (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic ? "dynamic"
: "static"));
recreate_execution_context();
}
}

// Report whether this engine can participate in an outer whole-graph CUDA stream
// capture. The decision is delegated to the runtime config, which inspects the
// active execution context; per the header, non-RTX builds always return true.
bool TRTEngine::is_monolithic_capturable(cudaStream_t stream) const {
  auto* active_ctx = exec_ctx.get();
  return runtime_cfg.is_monolithic_capturable(active_ctx, stream);
}

// One-shot opt-out of TensorRT-RTX native CUDA graph capture for this engine,
// invoked when an outer stream capture is detected. If this call is the one that
// flips the flag from enabled to disabled, the execution context is rebuilt so the
// updated CUDA graph strategy applies to subsequent enqueueV3 calls.
void TRTEngine::disable_rtx_native_cudagraphs() {
  const bool already_disabled = runtime_cfg.rtx_native_cudagraphs_disabled;
  runtime_cfg.disable_rtx_native_cudagraphs(name);
  const bool newly_disabled = !already_disabled && runtime_cfg.rtx_native_cudagraphs_disabled;
  if (newly_disabled) {
    recreate_execution_context();
  }
}

void TRTEngine::recreate_execution_context() {
// Flush any kernels the previous execution context may have compiled into the
// runtime cache before creating the replacement. The destructor also saves, but
// doing it here guards against losing compiled kernels across profiling toggles,
// allocator changes, or process kills that happen between allocator changes and
// teardown. No-op on standard TensorRT or when no cache path is configured.
runtime_cfg.save_runtime_cache();
runtime_cfg.ensure_initialized(cuda_engine.get());
Comment thread
tp5uiuc marked this conversation as resolved.
runtime_cfg.set_execution_context_allocation_strategy(
resource_allocation_strategy == ResourceAllocationStrategy::kDynamic
? nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED
: nvinfer1::ExecutionContextAllocationStrategy::kSTATIC);
exec_ctx = make_trt(cuda_engine->createExecutionContext(runtime_cfg.config.get()));
TORCHTRT_CHECK(exec_ctx.get() != nullptr, "Unable to (re)create TensorRT execution context");
}

} // namespace runtime
} // namespace core
} // namespace torch_tensorrt
44 changes: 36 additions & 8 deletions core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "torch/custom_class.h"

#include "core/runtime/TRTEngineProfiler.h"
#include "core/runtime/TRTRuntimeConfig.h"
#include "core/util/prelude.h"

namespace torch_tensorrt {
Expand All @@ -30,7 +31,14 @@ using FlattenedState = std::tuple<
std::tuple<std::string, std::string>, // requires_output_allocator
std::tuple<std::string, std::string>, // serialized metadata
std::tuple<std::string, std::string>, // Platform
std::tuple<std::string, std::string>>; // Resource Allocation Strategy
std::tuple<std::string, std::string> // Resource Allocation Strategy
#ifdef TRT_MAJOR_RTX
,
std::tuple<std::string, std::string>, // Runtime Cache Path (TRT-RTX)
std::tuple<std::string, std::string>, // Dynamic Shapes Kernel Strategy (TRT-RTX)
std::tuple<std::string, std::string> // CUDA Graph Strategy (TRT-RTX)
#endif
>;

struct TorchTRTRuntimeStates {
// Indicates whether CUDAGraphs were enabled in the previous execute_engine
Expand Down Expand Up @@ -125,31 +133,33 @@ struct TRTEngine : torch::CustomClassHolder {

~TRTEngine();
TRTEngine(
const std::string& serialized_engine,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& in_binding_names,
const std::vector<std::string>& out_binding_names,
const Platform& target_platform = get_current_platform(),
bool hardware_compatible = false,
bool requires_output_allocator = false,
const std::string& serialized_metadata = "",
std::string serialized_metadata = "",
const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
TRTEngine::ResourceAllocationStrategy::kStatic);
TRTEngine::ResourceAllocationStrategy::kStatic,
TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{});

TRTEngine(std::vector<std::string> serialized_info);

TRTEngine(
const std::string& mod_name,
const std::string& serialized_engine,
std::string mod_name,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& in_binding_names,
const std::vector<std::string>& out_binding_names,
const Platform& target_platform = get_current_platform(),
bool hardware_compatible = false,
bool requires_output_allocator = false,
const std::string& serialized_metadata = "",
std::string serialized_metadata = "",
const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
TRTEngine::ResourceAllocationStrategy::kStatic);
TRTEngine::ResourceAllocationStrategy::kStatic,
TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{});

TRTEngine& operator=(const TRTEngine& other);
std::string to_str() const;
Expand Down Expand Up @@ -217,6 +227,24 @@ struct TRTEngine : torch::CustomClassHolder {
ResourceAllocationStrategy resource_allocation_strategy = kStatic;
void set_resource_allocation_strategy(ResourceAllocationStrategy new_strategy);
ResourceAllocationStrategy get_resource_allocation_strategy();

// All TensorRT-RTX-specific IRuntimeConfig state lives here. On non-RTX builds this
// still owns a shared IRuntimeConfig (so the execution-context allocation strategy is
// applied via the uniform code path) but the RTX-only setters become no-ops.
TRTRuntimeConfig runtime_cfg;

// Monolithic-capturability check used when this engine is wrapped by an outer whole-graph
// capture (e.g. CudaGraphsTorchTensorRTModule). Non-RTX builds always return true.
bool is_monolithic_capturable(cudaStream_t stream) const;

// Disable TensorRT-RTX native CUDA graph capture on this engine (one-shot, invoked when
// an outer stream capture is detected around execute_engine). No-op on non-RTX.
void disable_rtx_native_cudagraphs();

private:
// Single entry point that (re)creates exec_ctx. Also creates (once) the IRuntimeConfig
// owned by runtime_cfg and applies all runtime config settings.
void recreate_execution_context();
};

} // namespace runtime
Expand Down
Loading
Loading