TensorRT Series: Dynamic Shape
Posted by 洪流之源
TensorRT supports inference with dynamically shaped inputs: when the engine is built, each dynamic dimension is given a range [L, H], and at inference time any shape with L <= shape <= H is accepted. The range is declared through an optimization profile created with createOptimizationProfile(); dynamic axes can also be specified when exporting the model to ONNX. This post demonstrates only the former approach.
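In outline, the flow has two halves: at build time an optimization profile declares the allowed range, and at run time the concrete shape is fixed before each enqueue. A minimal sketch (assuming the same TensorRT 8 C++ API as the full example below, with builder, config, and execution_context already created):

// Build time: declare the allowed shape range for the input named "image"
auto profile = builder->createOptimizationProfile();
profile->setDimensions("image", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, 1, 3, 3));
profile->setDimensions("image", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(1, 1, 3, 3));
profile->setDimensions("image", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(10, 1, 5, 5));
config->addOptimizationProfile(profile);

// Run time: fix the concrete input shape for binding 0 before enqueueing
execution_context->setBindingDimensions(0, nvinfer1::Dims4(2, 1, 3, 3));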
Example code:
// tensorRT include
#include <NvInfer.h>
#include <NvInferRuntime.h>
// cuda include
#include <cuda_runtime.h>
// system include
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <fstream> // ifstream / ios are used below
#include <vector>
using namespace std;
class TRTLogger : public nvinfer1::ILogger{
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
        if(severity <= Severity::kINFO){
            printf("%d: %s\n", severity, msg);
        }
    }
} logger;
nvinfer1::Weights make_weights(float* ptr, int n){
    nvinfer1::Weights w;
    w.count = n;
    w.type = nvinfer1::DataType::kFLOAT;
    w.values = ptr;
    return w;
}

bool build_model(){
    TRTLogger logger;
    // ----------------------------- 1. Create the builder, config, and network -----------------------------
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1);
    // Build a model
    /*
        Network definition:

        image
          |
        conv(3x3, pad=1)  input = 1, output = 1, bias = True  w=[[1.0, 2.0, 3.1], [0.1, 0.1, 0.1], [0.2, 0.2, 0.2]], b=0.0
          |
        relu
          |
        prob
    */
    // ----------------------------- 2. Input, network structure, and output basics -----------------------------
    const int num_input = 1;
    const int num_output = 1;
    float layer1_weight_values[] = {
        1.0, 2.0, 3.1,
        0.1, 0.1, 0.1,
        0.2, 0.2, 0.2
    }; // row-major
    float layer1_bias_values[] = {0.0};

    // To use dynamic shape, the dynamic dimensions in the NetworkDefinition must be set to -1; in_channel stays fixed
    nvinfer1::ITensor* input = network->addInput("image", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4(-1, num_input, -1, -1));
    nvinfer1::Weights layer1_weight = make_weights(layer1_weight_values, 9);
    nvinfer1::Weights layer1_bias = make_weights(layer1_bias_values, 1);
    auto layer1 = network->addConvolution(*input, num_output, nvinfer1::DimsHW(3, 3), layer1_weight, layer1_bias);
    layer1->setPadding(nvinfer1::DimsHW(1, 1));
    auto prob = network->addActivation(*layer1->getOutput(0), nvinfer1::ActivationType::kRELU); // i.e. *(layer1->getOutput(0))

    // Mark prob as the network output
    network->markOutput(*prob->getOutput(0));

    int maxBatchSize = 10;
    printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
    // Workspace memory: scratch space for layer implementations and intermediate activations (1 << 28 bytes = 256 MB)
    config->setMaxWorkspaceSize(1 << 28);
    // --------------------------------- 2.1 The optimization profile ----------------------------------
    // If the model has multiple inputs, each profile must set the dimension range for every input
    auto profile = builder->createOptimizationProfile();

    // Minimum allowed input: 1 x 1 x 3 x 3
    profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, num_input, 3, 3));
    // Optimal input (the shape TensorRT tunes kernels for): 1 x 1 x 3 x 3
    profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(1, num_input, 3, 3));
    // Maximum allowed input: 10 x 1 x 5 x 5
    profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(maxBatchSize, num_input, 5, 5));
    config->addOptimizationProfile(profile);
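    // Hypothetical aside (names here are illustrative, not part of this model): if the
    // network had a second input, say "image2", the SAME profile would also have to bound it:
    //   profile->setDimensions("image2", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, 1, 3, 3));
    //   profile->setDimensions("image2", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(1, 1, 3, 3));
    //   profile->setDimensions("image2", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(10, 1, 5, 5));
    // A profile only becomes valid once every dynamic input has its kMIN/kOPT/kMAX set.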
    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if(engine == nullptr){
        printf("Build engine failed.\n");
        return false;
    }

    // -------------------------- 3. Serialization ----------------------------------
    // Serialize the engine and save it to a file
    nvinfer1::IHostMemory* model_data = engine->serialize();
    FILE* f = fopen("engine.trtmodel", "wb");
    fwrite(model_data->data(), 1, model_data->size(), f);
    fclose(f);

    // Destroy objects in the reverse order of their construction
    model_data->destroy();
    engine->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    printf("Done.\n");
    return true;
}
vector<unsigned char> load_file(const string& file){
    ifstream in(file, ios::in | ios::binary);
    if (!in.is_open())
        return {};

    in.seekg(0, ios::end);
    size_t length = in.tellg();

    std::vector<uint8_t> data;
    if (length > 0){
        in.seekg(0, ios::beg);
        data.resize(length);
        in.read((char*)&data[0], length);
    }
    in.close();
    return data;
}
void inference(){
    // ------------------------------- 1. Load the model and deserialize -------------------------------
    TRTLogger logger;
    auto engine_data = load_file("engine.trtmodel");
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
    if(engine == nullptr){
        printf("Deserialize cuda engine failed.\n");
        runtime->destroy();
        return;
    }

    nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
    cudaStream_t stream = nullptr;
    cudaStreamCreate(&stream);
    /*
        Network definition:

        image
          |
        conv(3x3, pad=1)  input = 1, output = 1, bias = True  w=[[1.0, 2.0, 3.1], [0.1, 0.1, 0.1], [0.2, 0.2, 0.2]], b=0.0
          |
        relu
          |
        prob
    */
    // ------------------------------- 2. Input and output -------------------------------
    float input_data_host[] = {
        // batch 0
        1, 1, 1,
        1, 1, 1,
        1, 1, 1,

        // batch 1
        -1, 1, 1,
        1, 0, 1,
        1, 1, -1
    };
    float* input_data_device = nullptr;

    // With pad=1, a 3x3 input produces a 3x3 output
    int ib = 2;
    int iw = 3;
    int ih = 3;
    float output_data_host[ib * iw * ih];
    float* output_data_device = nullptr;
    cudaMalloc(&input_data_device, sizeof(input_data_host));
    cudaMalloc(&output_data_device, sizeof(output_data_host));
    cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream);
    // ------------------------------- 3. Inference -------------------------------
    // Tell the context the actual input dimensions for this inference call
    execution_context->setBindingDimensions(0, nvinfer1::Dims4(ib, 1, ih, iw));
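    // Aside (assumes the TensorRT 8 API): the shape passed above must lie within the
    // profile's [kMIN, kMAX] range, which can be queried back from the engine if needed:
    //   auto min_dims = engine->getProfileDimensions(0, 0, nvinfer1::OptProfileSelector::kMIN);
    //   auto max_dims = engine->getProfileDimensions(0, 0, nvinfer1::OptProfileSelector::kMAX);
    // setBindingDimensions returns false when the requested shape is out of range.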
    float* bindings[] = {input_data_device, output_data_device};
    bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
    cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    // ------------------------------- 4. Print the results -------------------------------
    for(int b = 0; b < ib; ++b){
        printf("batch %d. output_data_host = \n", b);
        for(int i = 0; i < iw * ih; ++i){
            printf("%f, ", output_data_host[b * iw * ih + i]);
            if((i + 1) % iw == 0)
                printf("\n");
        }
    }

    printf("Clean memory\n");
    cudaStreamDestroy(stream);
    cudaFree(input_data_device);
    cudaFree(output_data_device);
    execution_context->destroy();
    engine->destroy();
    runtime->destroy();
}
int main(){
    if(!build_model()){
        return -1;
    }
    inference();
    return 0;
}
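One detail worth calling out: with dynamic shapes the output dimensions also depend on the input, so hard-coding the output buffer size (as the example does for simplicity, because conv with pad=1 preserves H and W) does not generalize. A sketch of the more general pattern, under the same TensorRT 8 API assumptions:

// After setBindingDimensions has fixed the input shape, the output
// binding's dimensions become concrete and can be queried:
execution_context->setBindingDimensions(0, nvinfer1::Dims4(ib, 1, ih, iw));
nvinfer1::Dims out_dims = execution_context->getBindingDimensions(1); // binding 1 = prob
int output_numel = 1;
for(int i = 0; i < out_dims.nbDims; ++i)
    output_numel *= out_dims.d[i];
// allocate output_numel * sizeof(float) bytes for the output buffer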
Makefile:
cc := g++
name := pro
workdir := workspace
srcdir := src
objdir := objs
stdcpp := c++11
cuda_home := /home/liuhongyuan/miniconda3/envs/trtpy/lib/python3.8/site-packages//trtpy/trt8cuda112cudnn8
syslib := /home/liuhongyuan/miniconda3/envs/trtpy/lib/python3.8/site-packages//trtpy/lib
cpp_pkg := /home/liuhongyuan/miniconda3/envs/trtpy/lib/python3.8/site-packages//trtpy/cpp-packages
cuda_arch :=
nvcc := $(cuda_home)/bin/nvcc -ccbin=$(cc)
# Locate the .cpp sources and their dependency .mk files
cpp_srcs := $(shell find $(srcdir) -name "*.cpp")
cpp_objs := $(cpp_srcs:.cpp=.cpp.o)
cpp_objs := $(cpp_objs:$(srcdir)/%=$(objdir)/%)
cpp_mk := $(cpp_objs:.cpp.o=.cpp.mk)
# Locate the .cu sources and their dependency .mk files
cu_srcs := $(shell find $(srcdir) -name "*.cu")
cu_objs := $(cu_srcs:.cu=.cu.o)
cu_objs := $(cu_objs:$(srcdir)/%=$(objdir)/%)
cu_mk := $(cu_objs:.cu.o=.cu.mk)
# Libraries needed for OpenCV and CUDA
link_cuda := cudart cudnn
link_trtpro :=
link_tensorRT := nvinfer
link_opencv :=
link_sys := stdc++ dl
link_librarys := $(link_cuda) $(link_tensorRT) $(link_sys) $(link_opencv)
# Header search paths; note: no spaces after the trailing backslashes
# Paths only; -I is prepended automatically below
include_paths := src \\
$(cuda_home)/include/cuda \\
$(cuda_home)/include/tensorRT \\
$(cpp_pkg)/opencv4.2/include
# Library search paths; paths only, -L is prepended automatically below
library_paths := $(cuda_home)/lib64 $(syslib) $(cpp_pkg)/opencv4.2/lib
# Join the library paths into a single string, e.g. a b c => a:b:c,
# so that LD_LIBRARY_PATH=a:b:c
empty :=
library_path_export := $(subst $(empty) $(empty),:,$(library_paths))
# Batch-prepend -I, -L, -l to the include paths, library paths, and libraries
run_paths := $(foreach item,$(library_paths),-Wl,-rpath=$(item))
include_paths := $(foreach item,$(include_paths),-I$(item))
library_paths := $(foreach item,$(library_paths),-L$(item))
link_librarys := $(foreach item,$(link_librarys),-l$(item))
# For a different GPU, change -gencode=arch=compute_75,code=sm_75 to match its compute capability
# Compute capability lookup: https://developer.nvidia.com/zh-cn/cuda-gpus#compute
# On Jetson Nano, if the -m64 option is not recognized, remove it; results are unaffected
cpp_compile_flags := -std=$(stdcpp) -w -g -O0 -m64 -fPIC -fopenmp -pthread
cu_compile_flags := -std=$(stdcpp) -w -g -O0 -m64 $(cuda_arch) -Xcompiler "$(cpp_compile_flags)"
link_flags := -pthread -fopenmp -Wl,-rpath='$$ORIGIN'
cpp_compile_flags += $(include_paths)
cu_compile_flags += $(include_paths)
link_flags += $(library_paths) $(link_librarys) $(run_paths)
# If a header changes, the included .mk files trigger recompilation of the dependent .cpp/.cu files
ifneq ($(MAKECMDGOALS), clean)
-include $(cpp_mk) $(cu_mk)
endif
$(name) : $(workdir)/$(name)
all : $(name)
run : $(name)
@cd $(workdir) && ./$(name) $(run_args)
$(workdir)/$(name) : $(cpp_objs) $(cu_objs)
@echo Link $@
@mkdir -p $(dir $@)
@$(cc) $^ -o $@ $(link_flags)
$(objdir)/%.cpp.o : $(srcdir)/%.cpp
@echo Compile CXX $<
@mkdir -p $(dir $@)
@$(cc) -c $< -o $@ $(cpp_compile_flags)
$(objdir)/%.cu.o : $(srcdir)/%.cu
@echo Compile CUDA $<
@mkdir -p $(dir $@)
@$(nvcc) -c $< -o $@ $(cu_compile_flags)
# Generate the .mk dependency files for the cpp sources
$(objdir)/%.cpp.mk : $(srcdir)/%.cpp
@echo Compile depends C++ $<
@mkdir -p $(dir $@)
@$(cc) -M $< -MF $@ -MT $(@:.cpp.mk=.cpp.o) $(cpp_compile_flags)
# Generate the .mk dependency files for the cu sources
$(objdir)/%.cu.mk : $(srcdir)/%.cu
@echo Compile depends CUDA $<
@mkdir -p $(dir $@)
@$(nvcc) -M $< -MF $@ -MT $(@:.cu.mk=.cu.o) $(cu_compile_flags)
# Clean rule
clean :
@rm -rf $(objdir) $(workdir)/$(name) $(workdir)/*.trtmodel
# Prevent these target names from being treated as files
.PHONY : clean run $(name)
# Export the library path so the binary can find its dependencies at run time
export LD_LIBRARY_PATH:=$(library_path_export)