Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <cuda_runtime.h>
#include "NvInfer.h"
#include "c10/cuda/CUDACachingAllocator.h"
#include "c10/cuda/CUDAStream.h"
#include "torch/csrc/jit/frontend/function_schema_parser.h"
#include "torch/cuda.h"
Expand Down Expand Up @@ -60,6 +61,24 @@ void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims
shapes[tensorName] = dims;
}

// Drop all per-call input state once engine execution no longer needs it.
// Releases the formatted GPU input tensors and the CPU-side shape-tensor
// value buffers held for the current call.
void TRTEngine::clear_active_input_tensors() {
  // The two containers are independent; clearing order does not matter.
  active_shape_tensor_values.clear();
  active_input_tensors.clear();
}

void TRTEngine::reset_active_input_tensors() {
clear_active_input_tensors();
active_input_tensors.resize(num_io.first);
}

// Mark each live input tensor's storage as in use by `stream` via the CUDA
// caching allocator, so the allocator will not recycle that memory while
// work already queued on `stream` may still read it.
void TRTEngine::record_active_input_tensor_stream_usage(const c10::cuda::CUDAStream& stream) {
  for (const auto& tensor : active_input_tensors) {
    // Skip slots with nothing for the allocator to track: undefined
    // placeholders, non-CUDA tensors, storage-less tensors, and empty tensors.
    const bool trackable = tensor.defined() && tensor.is_cuda() && tensor.has_storage() && tensor.numel() > 0;
    if (!trackable) {
      continue;
    }
    c10::cuda::CUDACachingAllocator::recordStream(tensor.storage().data_ptr(), stream);
  }
}

TRTEngine::TRTEngine(
const std::string& serialized_engine,
const RTDevice& cuda_device,
Expand Down Expand Up @@ -193,9 +212,9 @@ TRTEngine::TRTEngine(

num_io = std::make_pair(inputs, outputs);
in_binding_names.resize(inputs);
input_buffers.resize(inputs);
cudagraph_input_staging_buffers.resize(inputs);
out_binding_names.resize(outputs);
output_buffers.resize(outputs);
cudagraph_output_staging_buffers.resize(outputs);
for (int64_t x = 0; x < cuda_engine->getNbIOTensors(); x++) {
std::string bind_name = cuda_engine->getIOTensorName(x);
if (cuda_engine->getTensorIOMode(bind_name.c_str()) == nvinfer1::TensorIOMode::kINPUT) {
Expand All @@ -207,7 +226,7 @@ TRTEngine::TRTEngine(
} else {
uint64_t inputs_size = _in_binding_names.size();
in_binding_names.resize(inputs_size);
input_buffers.resize(inputs_size);
cudagraph_input_staging_buffers.resize(inputs_size);
for (uint64_t pyt_idx = 0; pyt_idx < inputs_size; pyt_idx++) {
auto binding_name = _in_binding_names[pyt_idx];
// Check if the binding name provided is in the list of engine's bindings
Expand Down Expand Up @@ -237,7 +256,7 @@ TRTEngine::TRTEngine(

uint64_t outputs = _out_binding_names.size();
out_binding_names.resize(outputs);
output_buffers.resize(outputs);
cudagraph_output_staging_buffers.resize(outputs);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this system also get used for the pre_allocated outputs / output allocator?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No

for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) {
auto binding_name = _out_binding_names[pyt_idx];
// Check if the binding name provided is in the list of engine's bindings
Expand Down
18 changes: 16 additions & 2 deletions core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once
#include <filesystem>
#include <fstream>
#include <list>
#include <map>
#include <memory>
#include <mutex>
Expand Down Expand Up @@ -184,6 +185,12 @@ struct TRTEngine : torch::CustomClassHolder {
void set_pre_allocated_outputs(bool enable);
void set_output_tensors_as_unowned(bool enable);
bool are_output_tensors_unowned();
void clear_active_input_tensors();
void reset_active_input_tensors();
// Mark active input tensor allocations as used by a CUDA stream so the CUDA
// caching allocator does not recycle their storage while that stream may still
// access it.
void record_active_input_tensor_stream_usage(const c10::cuda::CUDAStream& stream);
TorchTRTRuntimeStates runtime_states;
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
static const char BINDING_DELIM = '%';
Expand All @@ -196,8 +203,15 @@ struct TRTEngine : torch::CustomClassHolder {
at::cuda::CUDAGraph cudagraph = {};
at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
std::vector<at::Tensor> input_buffers = {};
std::vector<at::Tensor> output_buffers = {};
std::vector<at::Tensor> cudagraph_input_staging_buffers = {};
std::vector<at::Tensor> cudagraph_output_staging_buffers = {};

// Per-call formatted input buffers. In standard mode these are bound
// directly to TRT; in CUDA graph mode they are async-copy sources for
// persistent CUDA graph input staging buffers.
std::vector<at::Tensor> active_input_tensors = {};
std::list<std::vector<int64_t>> active_shape_tensor_values = {};

std::string shape_key = "None";
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;
Expand Down
61 changes: 32 additions & 29 deletions core/runtime/execute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,8 @@ void setup_input_tensors(
std::vector<at::Tensor> inputs,
c10::intrusive_ptr<TRTEngine> compiled_engine,
bool cudagraphs_enabled,
bool need_cudagraphs_record,
std::list<std::vector<int64_t>>& inputShapeTensorValues) {
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
bool need_cudagraphs_record) {
compiled_engine->reset_active_input_tensors();

for (size_t i = 0; i < inputs.size(); i++) {
std::string name = compiled_engine->in_binding_names[i];
Expand All @@ -124,39 +123,40 @@ void setup_input_tensors(
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
std::vector<int64_t> inputs_cpu_vec(
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
compiled_engine->active_shape_tensor_values.emplace_back(std::move(inputs_cpu_vec));
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->active_shape_tensor_values.back().data()),
"Error while setting the tensor address for shape inputs");

if (cudagraphs_enabled) {
// @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
compiled_engine->input_buffers[i] = input_cpu;
compiled_engine->cudagraph_input_staging_buffers[i] = input_cpu;
}
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->active_shape_tensor_values.back().data()),
"Error while setting the tensor address for shape inputs");

} else {
at::Tensor contig_input = inputs[i].view(shape).contiguous();
formatted_inputs.emplace_back(std::move(contig_input));
compiled_engine->active_input_tensors[i] = inputs[i].view(shape).contiguous();

if (need_cudagraphs_record) {
// Create a new persistent input buffer
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
// Create a persistent CUDA graph input staging buffer with a stable replay address.
compiled_engine->cudagraph_input_staging_buffers[i] = compiled_engine->active_input_tensors[i].clone();
}

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

at::Tensor final_input;
if (cudagraphs_enabled) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
final_input = compiled_engine->input_buffers[i];
// If using CUDAGraphs copy formatted input to the corresponding persistent staging buffer.
compiled_engine->cudagraph_input_staging_buffers[i].copy_(compiled_engine->active_input_tensors[i], true);
final_input = compiled_engine->cudagraph_input_staging_buffers[i];
} else {
// Otherwise use the formatted buffer directly
final_input = formatted_inputs.back();
final_input = compiled_engine->active_input_tensors[i];
}

// Get tensor address, using placeholder for empty tensors
Expand Down Expand Up @@ -252,9 +252,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

// Shape tensor CPU buffers must outlive inferShapes() and enqueueV3()
std::list<std::vector<int64_t>> inputShapeTensorValues;

// Initialize inputs and outputs to be available throughout the succeeding scopes
{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
Expand All @@ -263,7 +260,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, inputShapeTensorValues);
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
std::vector<char const*> names(io_size);
Expand Down Expand Up @@ -291,14 +288,14 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
auto pyt_idx = output_indices.second;
std::string name = compiled_engine->out_binding_names[pyt_idx];
if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
// If recording a CUDA graph, update the persistent output staging buffer.
compiled_engine->cudagraph_output_staging_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
}

if (cudagraphs_enabled) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
name.c_str(), compiled_engine->cudagraph_output_staging_buffers[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
} else {
TORCHTRT_CHECK(
Expand All @@ -321,6 +318,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
}

compiled_engine->record_active_input_tensor_stream_usage(
cudagraphs_enabled ? compiled_engine->caller_stream : compiled_engine->engine_stream);

{ // Engine Execution (execute on engine stream)
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

Expand Down Expand Up @@ -356,6 +356,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}
} // End engine execution (resets to caller stream)

compiled_engine->clear_active_input_tensors();

// When the pre-allocated output mode is turned on, for intermediate modules, we only create the output in the first
// execution or when shape is changed.
if (compiled_engine->use_pre_allocated_outputs &&
Expand All @@ -370,9 +372,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
trt_exec_complete.block(compiled_engine->caller_stream);

if (cudagraphs_enabled) {
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
outputs[o].copy_(compiled_engine->output_buffers[o], false);
// If in CUDAGraph mode, copy persistent staging outputs to returned tensors on the caller stream.
for (size_t o = 0; o < compiled_engine->cudagraph_output_staging_buffers.size(); o++) {
outputs[o].copy_(compiled_engine->cudagraph_output_staging_buffers[o], false);
}
}

Expand All @@ -386,17 +388,14 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
};

auto run_output_allocator = [&]() {
// Shape tensor CPU buffers must outlive inferShapes() and enqueueV3()
std::list<std::vector<int64_t>> inputShapeTensorValues;

{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
input_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

setup_input_tensors(inputs, compiled_engine, false, false, inputShapeTensorValues);
setup_input_tensors(inputs, compiled_engine, false, false);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
std::vector<char const*> names(io_size);
Expand Down Expand Up @@ -430,6 +429,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
}

compiled_engine->record_active_input_tensor_stream_usage(compiled_engine->engine_stream);

{ // Engine Execution (execute on engine stream)
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

Expand All @@ -449,6 +450,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

} // End engine execution (resets to caller stream)

compiled_engine->clear_active_input_tensors();

// Block caller stream until engine execution is complete
at::cuda::CUDAEvent trt_exec_complete;
trt_exec_complete.record(compiled_engine->engine_stream);
Expand Down
Loading
Loading