Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <cuda_runtime.h>
#include "NvInfer.h"
#include "c10/cuda/CUDACachingAllocator.h"
#include "c10/cuda/CUDAStream.h"
#include "torch/csrc/jit/frontend/function_schema_parser.h"
#include "torch/cuda.h"
Expand Down Expand Up @@ -60,6 +61,24 @@ void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims
shapes[tensorName] = dims;
}

// Drop all per-call input state once engine execution no longer needs it.
// Releases the formatted GPU input tensors and the CPU-side shape-tensor
// value buffers held for the current call.
void TRTEngine::clear_active_input_tensors() {
  // The two containers are independent; clearing order does not matter.
  active_shape_tensor_values.clear();
  active_input_tensors.clear();
}

void TRTEngine::reset_active_input_tensors() {
clear_active_input_tensors();
active_input_tensors.resize(num_io.first);
}

// Mark each live input tensor's storage as in use by `stream` via the CUDA
// caching allocator, so the allocator will not recycle that memory while
// work already queued on `stream` may still read it.
void TRTEngine::record_active_input_tensor_stream_usage(const c10::cuda::CUDAStream& stream) {
  for (const auto& tensor : active_input_tensors) {
    // Skip slots with nothing for the allocator to track: undefined
    // placeholders, non-CUDA tensors, storage-less tensors, and empty tensors.
    const bool trackable = tensor.defined() && tensor.is_cuda() && tensor.has_storage() && tensor.numel() > 0;
    if (!trackable) {
      continue;
    }
    c10::cuda::CUDACachingAllocator::recordStream(tensor.storage().data_ptr(), stream);
  }
}

TRTEngine::TRTEngine(
const std::string& serialized_engine,
const RTDevice& cuda_device,
Expand Down Expand Up @@ -193,9 +212,9 @@ TRTEngine::TRTEngine(

num_io = std::make_pair(inputs, outputs);
in_binding_names.resize(inputs);
input_buffers.resize(inputs);
cudagraph_input_staging_buffers.resize(inputs);
out_binding_names.resize(outputs);
output_buffers.resize(outputs);
cudagraph_output_staging_buffers.resize(outputs);
for (int64_t x = 0; x < cuda_engine->getNbIOTensors(); x++) {
std::string bind_name = cuda_engine->getIOTensorName(x);
if (cuda_engine->getTensorIOMode(bind_name.c_str()) == nvinfer1::TensorIOMode::kINPUT) {
Expand All @@ -207,7 +226,7 @@ TRTEngine::TRTEngine(
} else {
uint64_t inputs_size = _in_binding_names.size();
in_binding_names.resize(inputs_size);
input_buffers.resize(inputs_size);
cudagraph_input_staging_buffers.resize(inputs_size);
for (uint64_t pyt_idx = 0; pyt_idx < inputs_size; pyt_idx++) {
auto binding_name = _in_binding_names[pyt_idx];
// Check if the binding name provided is in the list of engine's bindings
Expand Down Expand Up @@ -237,7 +256,7 @@ TRTEngine::TRTEngine(

uint64_t outputs = _out_binding_names.size();
out_binding_names.resize(outputs);
output_buffers.resize(outputs);
cudagraph_output_staging_buffers.resize(outputs);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this system also get used for the pre_allocated outputs / output allocator?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No

for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) {
auto binding_name = _out_binding_names[pyt_idx];
// Check if the binding name provided is in the list of engine's bindings
Expand Down
18 changes: 16 additions & 2 deletions core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once
#include <filesystem>
#include <fstream>
#include <list>
#include <map>
#include <memory>
#include <mutex>
Expand Down Expand Up @@ -184,6 +185,12 @@ struct TRTEngine : torch::CustomClassHolder {
void set_pre_allocated_outputs(bool enable);
void set_output_tensors_as_unowned(bool enable);
bool are_output_tensors_unowned();
void clear_active_input_tensors();
void reset_active_input_tensors();
// Mark active input tensor allocations as used by a CUDA stream so the CUDA
// caching allocator does not recycle their storage while that stream may still
// access it.
void record_active_input_tensor_stream_usage(const c10::cuda::CUDAStream& stream);
TorchTRTRuntimeStates runtime_states;
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
static const char BINDING_DELIM = '%';
Expand All @@ -196,8 +203,15 @@ struct TRTEngine : torch::CustomClassHolder {
at::cuda::CUDAGraph cudagraph = {};
at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
std::vector<at::Tensor> input_buffers = {};
std::vector<at::Tensor> output_buffers = {};
std::vector<at::Tensor> cudagraph_input_staging_buffers = {};
std::vector<at::Tensor> cudagraph_output_staging_buffers = {};

// Per-call formatted input buffers. In standard mode these are bound
// directly to TRT; in CUDA graph mode they are async-copy sources for
// persistent CUDA graph input staging buffers.
std::vector<at::Tensor> active_input_tensors = {};
std::list<std::vector<int64_t>> active_shape_tensor_values = {};

std::string shape_key = "None";
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;
Expand Down
61 changes: 32 additions & 29 deletions core/runtime/execute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,8 @@ void setup_input_tensors(
std::vector<at::Tensor> inputs,
c10::intrusive_ptr<TRTEngine> compiled_engine,
bool cudagraphs_enabled,
bool need_cudagraphs_record,
std::list<std::vector<int64_t>>& inputShapeTensorValues) {
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
bool need_cudagraphs_record) {
compiled_engine->reset_active_input_tensors();

for (size_t i = 0; i < inputs.size(); i++) {
std::string name = compiled_engine->in_binding_names[i];
Expand All @@ -124,39 +123,40 @@ void setup_input_tensors(
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
std::vector<int64_t> inputs_cpu_vec(
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
compiled_engine->active_shape_tensor_values.emplace_back(std::move(inputs_cpu_vec));
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->active_shape_tensor_values.back().data()),
"Error while setting the tensor address for shape inputs");

if (cudagraphs_enabled) {
// @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
compiled_engine->input_buffers[i] = input_cpu;
compiled_engine->cudagraph_input_staging_buffers[i] = input_cpu;
}
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->active_shape_tensor_values.back().data()),
"Error while setting the tensor address for shape inputs");

} else {
at::Tensor contig_input = inputs[i].view(shape).contiguous();
formatted_inputs.emplace_back(std::move(contig_input));
compiled_engine->active_input_tensors[i] = inputs[i].view(shape).contiguous();

if (need_cudagraphs_record) {
// Create a new persistent input buffer
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
// Create a persistent CUDA graph input staging buffer with a stable replay address.
compiled_engine->cudagraph_input_staging_buffers[i] = compiled_engine->active_input_tensors[i].clone();
}

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

at::Tensor final_input;
if (cudagraphs_enabled) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
final_input = compiled_engine->input_buffers[i];
// If using CUDAGraphs copy formatted input to the corresponding persistent staging buffer.
compiled_engine->cudagraph_input_staging_buffers[i].copy_(compiled_engine->active_input_tensors[i], true);
final_input = compiled_engine->cudagraph_input_staging_buffers[i];
} else {
// Otherwise use the formatted buffer directly
final_input = formatted_inputs.back();
final_input = compiled_engine->active_input_tensors[i];
}

// Get tensor address, using placeholder for empty tensors
Expand Down Expand Up @@ -252,9 +252,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

// Shape tensor CPU buffers must outlive inferShapes() and enqueueV3()
std::list<std::vector<int64_t>> inputShapeTensorValues;

// Initialize inputs and outputs to be available throughout the succeeding scopes
{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
Expand All @@ -263,7 +260,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, inputShapeTensorValues);
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
std::vector<char const*> names(io_size);
Expand Down Expand Up @@ -291,14 +288,14 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
auto pyt_idx = output_indices.second;
std::string name = compiled_engine->out_binding_names[pyt_idx];
if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
// If recording a CUDA graph, update the persistent output staging buffer.
compiled_engine->cudagraph_output_staging_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
}

if (cudagraphs_enabled) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
name.c_str(), compiled_engine->cudagraph_output_staging_buffers[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
} else {
TORCHTRT_CHECK(
Expand All @@ -321,6 +318,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
}

compiled_engine->record_active_input_tensor_stream_usage(
cudagraphs_enabled ? compiled_engine->caller_stream : compiled_engine->engine_stream);

{ // Engine Execution (execute on engine stream)
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

Expand Down Expand Up @@ -356,6 +356,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}
} // End engine execution (resets to caller stream)

compiled_engine->clear_active_input_tensors();

// When the pre-allocated output mode is turned on, for intermediate modules, we only create the output in the first
// execution or when shape is changed.
if (compiled_engine->use_pre_allocated_outputs &&
Expand All @@ -370,9 +372,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
trt_exec_complete.block(compiled_engine->caller_stream);

if (cudagraphs_enabled) {
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
outputs[o].copy_(compiled_engine->output_buffers[o], false);
// If in CUDAGraph mode, copy persistent staging outputs to returned tensors on the caller stream.
for (size_t o = 0; o < compiled_engine->cudagraph_output_staging_buffers.size(); o++) {
outputs[o].copy_(compiled_engine->cudagraph_output_staging_buffers[o], false);
}
}

Expand All @@ -386,17 +388,14 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
};

auto run_output_allocator = [&]() {
// Shape tensor CPU buffers must outlive inferShapes() and enqueueV3()
std::list<std::vector<int64_t>> inputShapeTensorValues;

{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
input_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

setup_input_tensors(inputs, compiled_engine, false, false, inputShapeTensorValues);
setup_input_tensors(inputs, compiled_engine, false, false);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
std::vector<char const*> names(io_size);
Expand Down Expand Up @@ -430,6 +429,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
}

compiled_engine->record_active_input_tensor_stream_usage(compiled_engine->engine_stream);

{ // Engine Execution (execute on engine stream)
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

Expand All @@ -449,6 +450,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

} // End engine execution (resets to caller stream)

compiled_engine->clear_active_input_tensors();

// Block caller stream until engine execution is complete
at::cuda::CUDAEvent trt_exec_complete;
trt_exec_complete.record(compiled_engine->engine_stream);
Expand Down
Loading
Loading