Preface

Study materials:
cookbook, the companion code for the Bilibili video series

Example

Reference source: cookbook → 04-BuildEngineByONNXParser → pyTorch-ONNX-TensorRT

Source Code

The C++ code is fairly long, so it has been uploaded to GitHub.
Install OpenCV:

apt install libopencv-dev

(1) Convert ResNet-18 to ONNX as described in the Python article

python generate_onnx.py

(2) Build and run

mkdir build
cd build
cmake ..
make
cd ../bin

./demo
./demo --fp16
./demo --int8

Walkthrough

  In the cookbook, createCalibrationAndInferenceData.py stores the MNIST data as an npz file, and the C++ side reads the data directly from the NumPy file for both inference and calibration, which avoids any image-decoding code.
  The example in this article still follows the cookbook and runs inference with ResNet, but the part that reads the NumPy file is replaced with reading local images, OpenCV is used for preprocessing, and the calibrator code used in INT8 mode is modified accordingly.

(1) Preprocessing

std::vector<float> loadImg(const std::string filename, int width, int height, int channel) {
    cv::Mat image = cv::imread(filename, cv::IMREAD_COLOR);
    if (image.empty()) {
        std::cerr << "Error: Unable to read image file." << std::endl;
        return std::vector<float>();
    }

    cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
    cv::resize(image, image, cv::Size(width, height));
    image.convertTo(image, CV_32F, 1.0 / 255.0);
    cv::Scalar meanData(0.485, 0.456, 0.406);
    cv::Scalar stdData(0.229, 0.224, 0.225);
    cv::subtract(image, meanData, image);
    cv::divide(image, stdData, image);
    // The image reading, resize, normalization, and standardization above all go
    // through OpenCV APIs and are much the same as the Python code.

    // Below, the array layout is rearranged: (h, w, 3) -> (3, h, w).
    // The image is first split into its three channels, and the data of each channel
    // is copied into `data` in turn. Iterating over the pixels directly and writing
    // into `data` would normally be more efficient.
    std::vector<cv::Mat> channels;
    cv::split(image, channels);

    std::vector<float> data(channel * height * width);
    int idx = 0;
    for (int c = 0; c < channel; ++c) {
        for (int h = 0; h < height; ++h) {
            for (int w = 0; w < width; ++w) {
                data[idx++] = channels[c].at<float>(h, w);
            }
        }
    }
    return data;
}
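  Once loadImg returns the CHW float data, running inference only needs a host-to-device copy and an execute call. The fragment below is a minimal usage sketch, not code from the article's repo; the variable names context and outputSize and the binding order are assumptions for illustration.

// Hypothetical usage sketch: run one image through an already-built engine
std::vector<float> input = loadImg("test.jpg", 224, 224, 3);
if (input.empty()) { return -1; }

void *bufferIn = nullptr, *bufferOut = nullptr;
cudaMalloc(&bufferIn, input.size() * sizeof(float));
cudaMalloc(&bufferOut, outputSize * sizeof(float));          // outputSize: number of classes, e.g. 1000
cudaMemcpy(bufferIn, input.data(), input.size() * sizeof(float), cudaMemcpyHostToDevice);

void *bindings[2] {bufferIn, bufferOut};                     // assumes binding 0 = input, 1 = output
context->executeV2(bindings);                                // synchronous inference

std::vector<float> output(outputSize);
cudaMemcpy(output.data(), bufferOut, output.size() * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(bufferIn);
cudaFree(bufferOut);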

(2) Calibrator

  The constructor and getBatch are the main parts that differ from the cookbook. Look at the original version first; note that it also differs from the calibrator in the Python article:
  Python: randomly sample batchsize items from the calibration data and repeat the calibration loop nCalibration times.
  C++: take batchsize items from the calibration data in order until less than one full batch remains; the nCalibration parameter is never actually used.
MyCalibrator::MyCalibrator(const std::string &calibrationDataFile, const int nCalibration, const Dims32 dim, const std::string &cacheFile):
    nCalibration(nCalibration), dim(dim), cacheFile(cacheFile), iBatch(0)
{
    cnpy::npz_t    npzFile = cnpy::npz_load(calibrationDataFile);
    cnpy::NpyArray array   = npzFile[std::string("calibrationData")];
    pData                  = array.data<float>();
    if (pData == nullptr)
    {
        std::cout << "Failed getting calibration data!" << std::endl;
        return;
    }
    // nBatch takes the place of nCalibration
    nBatch   = array.num_bytes() / bufferSize;  // Clearly a bug in the original source: this line should come after bufferSize is computed
    // nElement is the number of elements in the input array, i.e. the product of all dims (c*h*w when the batch size is 1)
    nElement = 1;
    for (int i = 0; i < dim.nbDims; ++i)
    {
        nElement *= dim.d[i];
    }
    // bufferSize is the byte size of that buffer
    bufferSize = sizeof(float) * nElement;
    cudaMalloc((void **)&bufferD, bufferSize);

    return;
}
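  Following the inline note above, a hedged reordering (my own sketch, not the cookbook's code) would compute nElement and bufferSize before deriving nBatch, so the division no longer uses an uninitialized value:

    // Corrected ordering: element count and byte size first, batch count afterwards
    nElement = 1;
    for (int i = 0; i < dim.nbDims; ++i)
    {
        nElement *= dim.d[i];
    }
    bufferSize = sizeof(float) * nElement;
    nBatch     = array.num_bytes() / bufferSize;   // bufferSize is now valid
    cudaMalloc((void **)&bufferD, bufferSize);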

bool MyCalibrator::getBatch(void *bindings[], char const *names[], int32_t nbBindings) noexcept
{
    if (iBatch < nBatch)
    {
        cudaMemcpy(bufferD, &pData[iBatch * nElement], bufferSize, cudaMemcpyHostToDevice);
        bindings[0] = bufferD;
        iBatch++;
        return true;
    }
    else
    {
        return false;
    }
}
  This article's example uses the image files in the calibrationDataDir folder as calibration data, replacing the NumPy data used in the cookbook:
MyCalibrator::MyCalibrator(const std::string &calibrationDataDir, const int nCalibration, const Dims32 dim, const std::string &cacheFile):
    nCalibration(nCalibration), dim(dim), cacheFile(cacheFile), iBatch(0) {
    // Range-based for loop: iterates over the elements of a container or other
    // iterable, similar to Python's `for entry in os.listdir(dir)`.
    // const: read-only variable
    // auto:  type deduced automatically
    // &:     reference, avoids a copy
    // fs::directory_iterator: provided by <filesystem> in C++17
    for (const auto& entry : fs::directory_iterator(calibrationDataDir)) {
        if (fs::is_regular_file(entry)) {
            files.push_back(entry.path().string());
        }
    }

    nBatch = files.size() / dim.d[0];
    nElement = 1;
    for (int i = 0; i < dim.nbDims; ++i) {
        nElement *= dim.d[i];
    }
    bufferSize = sizeof(float) * nElement;
    cudaMalloc((void **)&bufferD, bufferSize);

    return;
}

bool MyCalibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept {
    if (iBatch < nBatch) {
        for (int i = 0; i < dim.d[0]; ++i) {
            // Read the images one by one and copy each into its slot in bufferD
            std::vector<float> img = loadImg(files[iBatch * dim.d[0] + i], dim.d[3], dim.d[2], dim.d[1]);
            cudaMemcpy(&bufferD[i * img.size()], img.data(), img.size() * sizeof(float), cudaMemcpyHostToDevice);
        }
        bindings[0] = bufferD;
        iBatch++;
        return true;
    }
    else {
        return false;
    }
}
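  Besides the constructor and getBatch, an IInt8EntropyCalibrator2 also has to implement getBatchSize plus the cache read/write callbacks, which the article does not show. Below is a minimal sketch built around the cacheFile member; the std::vector<char> cache member and the exact file handling are assumptions, not the article's code.

// Batch size reported to TensorRT: the batch dimension of the calibration input
int32_t MyCalibrator::getBatchSize() const noexcept {
    return dim.d[0];
}

// Load a previously written calibration cache, if any. `cache` is an assumed
// std::vector<char> member that keeps the data alive after this call returns.
void const *MyCalibrator::readCalibrationCache(std::size_t &length) noexcept {
    std::ifstream f(cacheFile, std::ios::binary);
    if (!f.good()) {
        std::cout << "Failed finding cache file!" << std::endl;
        return nullptr;
    }
    cache.assign(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>());
    length = cache.size();
    return cache.empty() ? nullptr : cache.data();
}

// Persist the cache produced by calibration so later builds can skip getBatch
void MyCalibrator::writeCalibrationCache(void const *ptr, std::size_t length) noexcept {
    std::ofstream f(cacheFile, std::ios::binary);
    f.write(reinterpret_cast<char const *>(ptr), length);
}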

A Strange Bug

  In INT8 mode, the calibration batch size was initially set to 1 (calibrationBatchSize {1};) while the commonly used input batch size was 4 (profile->setDimensions(inputTensor->getName(), OptProfileSelector::kOPT, Dims32 {4, {4, nChannel, nHeight, nWidth}});). With this combination the following error appeared:

Succeeded parsing .onnx file!
Failed finding cache file!
ERROR: 1: [calibrator.cpp::add::793] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 1: [executionContext.cpp::commonEmitDebugTensor::1855] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 1: [resizingAllocator.cpp::deallocate::105] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
...
ERROR: 1: [resizingAllocator.cpp::deallocate::105] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 3: [engine.cpp::~Engine::298] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/engine.cpp::~Engine::298, condition: mExecutionContextCounter.use_count() == 1. Destroying an engine object before destroying the IExecutionContext objects it created leads to undefined behavior.
)
ERROR: 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
ERROR: 1: [cudaResources.cpp::~ScopedCudaStream::47] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 2: [calibrator.cpp::calibrateEngine::1181] Error Code 2: Internal Error (Assertion context->executeV2(&bindings[0]) failed. )
Failed building serialized engine!

  Yet no illegal memory access turned up no matter how many times I checked the code. After accidentally changing the common input batch size the code ran through, so I did the tests below. In principle optBatchSize should have nothing to do with the batch size used during calibration, so this is most likely an internal TensorRT bug.
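  For context, the two fragments quoted above sit in the engine-build code roughly as follows. This is a sketch pieced together from the snippets in this section; the kMIN/kMAX shapes and the names config, profile, and inputTensor are assumptions.

// Calibration side: batch size 1, as in the failing configuration
int calibrationBatchSize {1};
MyCalibrator calibrator(calibrationDataDir, nCalibration,
                        Dims32 {4, {calibrationBatchSize, nChannel, nHeight, nWidth}},
                        cacheFile);
config->setFlag(BuilderFlag::kINT8);
config->setInt8Calibrator(&calibrator);

// Runtime side: dynamic-shape profile whose kOPT batch size of 4 triggered the crash
profile->setDimensions(inputTensor->getName(), OptProfileSelector::kMIN,
                       Dims32 {4, {1, nChannel, nHeight, nWidth}});
profile->setDimensions(inputTensor->getName(), OptProfileSelector::kOPT,
                       Dims32 {4, {4, nChannel, nHeight, nWidth}});
profile->setDimensions(inputTensor->getName(), OptProfileSelector::kMAX,
                       Dims32 {4, {8, nChannel, nHeight, nWidth}});
config->addOptimizationProfile(profile);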
