TensorRT C++网络模型接口推理
Overview:
This post describes how to use TensorRT from C++ and run inference with a network model.
Environment:
CUDA 11.3
cuDNN 8.2
TensorRT 8.4
Contents:
1. Read the serialized engine file
// Read the serialized engine (plan) file into a host buffer
char* trtModelStream = nullptr;
size_t size = 0;
std::ifstream file(engine_file_path, std::ios::binary);
if (file.good())
{
    // Determine the file size, then read the whole plan into memory
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();
    std::cout << "read engine ok" << std::endl;
}
else
{
    std::cout << "read engine failed" << std::endl;
}
Here engine_file_path points to an already serialized engine file. For a yolov5 model it can be produced with the Python export script of the yolov5 repository (e.g. python export.py --weights yolov5s.pt --include engine --device 0); alternatively, the engine can be built directly in C++ from an exported ONNX model, as sketched below.
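The following is a minimal sketch of that C++ alternative using the TensorRT builder and the nvonnxparser library; the file names model.onnx and model.engine and the function name buildEngineFromOnnx are placeholders, and error handling and object cleanup are omitted.
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include <fstream>

// Sketch: build a serialized engine from an ONNX file and write it to disk
void buildEngineFromOnnx(nvinfer1::ILogger& logger)
{
    using namespace nvinfer1;
    IBuilder* builder = createInferBuilder(logger);
    // ONNX models require an explicit-batch network
    INetworkDefinition* network = builder->createNetworkV2(
        1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
    parser->parseFromFile("model.onnx", static_cast<int>(ILogger::Severity::kWARNING));

    IBuilderConfig* config = builder->createBuilderConfig();
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1ULL << 30);  // 1 GB workspace

    // Serialize the optimized engine and save it to disk
    IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
    std::ofstream out("model.engine", std::ios::binary);
    out.write(static_cast<const char*>(plan->data()), plan->size());
    // (cleanup of plan/config/parser/network/builder omitted in this sketch)
}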
2. Deserialize the engine data with a runtime object and create an execution context
// Create the runtime, deserialize the engine from the in-memory plan, then create an execution context
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
// The host copy of the plan is no longer needed once the engine is deserialized
delete[] trtModelStream;
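createInferRuntime expects an nvinfer1::ILogger instance; the gLogger object used above is not defined in this snippet. A minimal sketch of such a logger (the class name Logger is an assumption) could look like this:
#include "NvInfer.h"
#include <iostream>

// Minimal ILogger implementation: print warnings and errors to stderr
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        if (severity <= Severity::kWARNING)
            std::cerr << msg << std::endl;
    }
};
static Logger gLogger;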
3. Query the tensor (binding) information stored in the model
// Look up the binding indices by name and verify the expected data type
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
assert(engine->getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
assert(engine->getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
int mBatchSize = engine->getMaxBatchSize();
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// Output tensor dimensions, used below to size the host and device buffers
auto out_dims = engine->getBindingDimensions(outputIndex);
Here INPUT_BLOB_NAME and OUTPUT_BLOB_NAME are the names of the model's input and output bindings; they can be read from the output of the Python export step. The CHECK macro used in these snippets is a CUDA error-checking helper that is not shown in the original code; a possible sketch follows.
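A common form of such a CHECK macro (an assumption, not part of the original code) is:
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <iostream>

// Abort with a message if a CUDA runtime call does not return cudaSuccess
#define CHECK(call)                                                             \
    do {                                                                        \
        cudaError_t status = (call);                                            \
        if (status != cudaSuccess) {                                            \
            std::cerr << "CUDA error: " << cudaGetErrorString(status)           \
                      << " at " << __FILE__ << ":" << __LINE__ << std::endl;    \
            std::abort();                                                       \
        }                                                                       \
    } while (0)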
4. Allocate CUDA device memory for the input and output
// Total number of elements in the (first) output tensor
auto output_size = 1;
auto output1_size = 1;   // element count of the second output used by segmentation models, see below
for (int j = 0; j < out_dims.nbDims; j++)
    output_size *= out_dims.d[j];
static const int Num_box = out_dims.d[1];

// Host-side result buffers
static float* prob = new float[output_size];
static float* prob1;
static int _segWidth, _segHeight, _segChannels;

// Create GPU buffers on device -- one device pointer per engine binding
std::vector<void*> buffers(engine->getNbBindings(), nullptr);
CHECK(cudaMalloc(&buffers[inputIndex], 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));
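The snippet above only allocates the first output. For a segmentation engine ("seg" mode), step 5 also reads a second output binding named OUTPUT_BLOB_NAME1; a sketch of sizing and allocating it, mirroring the first output, might look like the following (the interpretation of its dimensions as [batch, channels, height, width] is an assumption):
// Size and allocate the second output (segmentation mask prototypes), if present
if (engine_mode == "seg")
{
    const int outputIndex1 = engine->getBindingIndex(OUTPUT_BLOB_NAME1);
    auto out_dims1 = engine->getBindingDimensions(outputIndex1);
    output1_size = 1;
    for (int j = 0; j < out_dims1.nbDims; j++)
        output1_size *= out_dims1.d[j];
    // Assumption: dims are [batch, channels, height, width] for the mask prototypes
    _segChannels = out_dims1.d[1];
    _segHeight   = out_dims1.d[2];
    _segWidth    = out_dims1.d[3];
    prob1 = new float[output1_size];
    CHECK(cudaMalloc(&buffers[outputIndex1], output1_size * sizeof(float)));
}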
5. Run inference with the execution context
// Copy the preprocessed input to the device, enqueue inference, then copy the results back.
// output and output1 point to host memory (e.g. prob and prob1 allocated in step 4).
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
//context->executeV2(buffers.data());   // synchronous alternative
context->enqueueV2(buffers.data(), stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
if (engine_mode == "seg")
    CHECK(cudaMemcpyAsync(output1, buffers[engine->getBindingIndex(OUTPUT_BLOB_NAME1)], output1_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
// Wait until all work queued on the stream has finished
cudaStreamSynchronize(stream);
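Once inference is done, the stream, the device buffers, and the TensorRT objects should be released. A minimal cleanup sketch, assuming TensorRT 8.x where destroy() is deprecated in favour of plain delete:
// Release the stream, the device buffers, and the TensorRT objects
cudaStreamDestroy(stream);
for (void* buf : buffers)
    CHECK(cudaFree(buf));
delete[] prob;
delete context;
delete engine;
delete runtime;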
That covers the main points of TensorRT C++ network model inference. If this did not solve your problem, the following article may help:
TensorRT model deployment in practice on NVIDIA platforms: C++ vs. Python (with source code)