
TensorRT Custom Layers Explained


A detailed walkthrough of a TensorRT custom layer (plugin) class implementation

  • Common functions
    • Global constants
    • Macro functions
    • Kernel-related definitions
  • class PReLUPlugin explained
    • Private members
    • PReLUPlugin(const nvinfer1::Weights* weights, int nbWeights);
    • PReLUPlugin::PReLUPlugin(const void *data, size_t length)
    • virtual size_t getSerializationSize() const override
    • virtual void serialize(void* buffer) const override;
    • PReLUPlugin() = delete;
    • ~PReLUPlugin();
    • virtual int getNbOutputs() const override;
    • virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
    • virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
    • virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
    • virtual int initialize() override
    • virtual void terminate() override
    • virtual size_t getWorkspaceSize(int maxBatchSize) const override
    • virtual const char* getPluginType() const override
    • virtual const char* getPluginVersion() const override;
    • virtual void destroy()
    • virtual nvinfer1::IPluginV2* clone() const override;
    • virtual void setPluginNamespace(const char* pluginNamespace) override {}
    • virtual const char* getPluginNamespace() const override
    • virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
    • Kernel invocation

Common functions

Global constants

static const char* G_PRELU_TYPE = "PReLU";
static const char* G_PRELU_NAME = "PReLU_TRT";

Macro functions

#define CUDA_KERNEL_LOOP(i, n) \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
         i < (n); \
         i += blockDim.x * gridDim.x)

Kernel-related definitions

// CUDA: number of threads per block.
static const int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks needed to cover N elements.
inline int CAFFE_GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
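Together these give the usual Caffe-style grid-stride launch pattern: CAFFE_GET_BLOCKS rounds the block count up, and CUDA_KERNEL_LOOP lets every thread stride across the whole range so the kernel is correct for any N. A minimal sketch (the kernel scale_kernel and its arguments are hypothetical, purely to show the pattern):

// Hypothetical element-wise kernel: each thread strides over the full
// range, so the launch stays correct even if N > blocks * threads.
__global__ void scale_kernel(const int n, const float alpha,
                             const float* in, float* out) {
    CUDA_KERNEL_LOOP(i, n) {
        out[i] = alpha * in[i];
    }
}

// Launch:
// scale_kernel<<<CAFFE_GET_BLOCKS(N), CUDA_NUM_THREADS, 0, stream>>>(N, 2.0f, d_in, d_out);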

class PReLUPlugin explained

Private members

The input's CHW dimensions, the weight values, their data type, and the device-side copy of the weights:

int mNbInputChannels, mNbInputHeight, mNbInputWidth;
nvinfer1::Weights mWeights;
nvinfer1::DataType mDataType{nvinfer1::DataType::kFLOAT};
void* mDeviceKernel{nullptr};

PReLUPlugin(const nvinfer1::Weights* weights, int nbWeights);

This constructor reads the layer's parameters (weights, biases, and so on) into the plugin:
it deep-copies the values from the parsed model file into the private members.

PReLUPlugin::PReLUPlugin(const nvinfer1::Weights *weights, int nbWeights) {
    mWeights = weights[0];
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), weights[0].values,
           mWeights.count * type2size(mWeights.type));
}
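The helper type2size is used throughout the article but never shown; a minimal sketch, assuming it only has to handle the two data types this plugin supports:

// Assumed helper: byte size of one element of the given TensorRT type.
inline size_t type2size(nvinfer1::DataType type) {
    return (type == nvinfer1::DataType::kFLOAT) ? sizeof(float) : sizeof(__half);
}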

PReLUPlugin::PReLUPlugin(const void *data, size_t length)

Reads the data back from a serialized engine file: it copies the CHW dimensions and the
fields of the nvinfer1::Weights struct into the private members, then asserts that exactly
`length` bytes were consumed.

PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
    const char *d = static_cast<const char *>(data), *a = d;
    read<int>(d, mNbInputChannels);
    read<int>(d, mNbInputHeight);
    read<int>(d, mNbInputWidth);
    read<nvinfer1::DataType>(d, mDataType);
    read<int64_t>(d, mWeights.count);
    read<nvinfer1::DataType>(d, mWeights.type);
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), d,
           mWeights.count * type2size(mWeights.type));
    d += mWeights.count * type2size(mWeights.type);
    ASSERT(d == a + length);
}

virtual size_t getSerializationSize() const override

Returns how many bytes the plugin occupies in the serialization buffer; it is simply the combined size of every field the deserialization constructor reads.

size_t PReLUPlugin::getSerializationSize() const {
    return sizeof(mNbInputChannels) + sizeof(mNbInputHeight) +
           sizeof(mNbInputWidth) + sizeof(mDataType) +
           sizeof(mWeights.count) + sizeof(mWeights.type) +
           mWeights.count * type2size(mWeights.type);
}

virtual void serialize(void* buffer) const override;

Serializes the plugin into the buffer. It is the mirror image of the deserialization constructor: one writes and the other reads, and both must touch the same fields in the same order.

void PReLUPlugin::serialize(void *buffer) const {
    char *d = static_cast<char *>(buffer), *a = d;
    write(d, mNbInputChannels);
    write(d, mNbInputHeight);
    write(d, mNbInputWidth);
    write(d, mDataType);
    write(d, mWeights.count);
    write(d, mWeights.type);
    convertAndCopyToBuffer(d, mWeights, mWeights.type);
    ASSERT(d == a + getSerializationSize());
}
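The read/write helpers both functions rely on are not shown in the article; a minimal sketch of the usual pointer-advancing pair:

// Assumed helpers: copy a value out of / into the buffer and advance the cursor.
template <typename T>
void read(const char *&buffer, T &val) {
    val = *reinterpret_cast<const T *>(buffer);
    buffer += sizeof(T);
}

template <typename T>
void write(char *&buffer, const T &val) {
    *reinterpret_cast<T *>(buffer) = val;
    buffer += sizeof(T);
}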

PReLUPlugin() = delete;

A default constructor would be meaningless for this plugin, so it is deleted.

~PReLUPlugin();

The destructor. Free everything you allocated, on both host and device, and null the
pointers to guard against double frees.

PReLUPlugin::~PReLUPlugin() {
    if (mWeights.values) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
    if (mDeviceKernel) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
}

virtual int getNbOutputs() const override;

Returns the number of output tensors. PReLU, like ReLU, produces exactly one output, so this returns 1; for your own layer, return however many tensors one forward pass emits.

int PReLUPlugin::getNbOutputs() const {
    return 1;
}

virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override

Returns the dimensions of the output tensor, computed from the input dimensions and the
operation. The index argument selects which output is being queried; since getNbOutputs()
returns 1 here, only index 0 is valid.

nvinfer1::Dims PReLUPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) {
    if (index == 0) {
        // PReLU is element-wise: the output shape equals the input shape.
        return nvinfer1::Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    // } else if (index == n) {
    //     ... for other outputs, if any exist.
    } else {
        ASSERT(false);
    }
}

virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;

Reports which data types and formats the plugin supports, for example FP32 or FP16 in NCHW layout.

bool PReLUPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW;
}

virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;

Double-checks that the chosen type and format are supported, then records the input dimensions and data type in the private members.

void PReLUPlugin::configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
                                      const nvinfer1::Dims* outputDims, int nbOutputs,
                                      nvinfer1::DataType type, nvinfer1::PluginFormat format,
                                      int maxBatchSize) {
    ASSERT((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW);
    mNbInputChannels = inputDims[0].d[0];
    mNbInputHeight = inputDims[0].d[1];
    mNbInputWidth = inputDims[0].d[2];
    mDataType = type;
}

virtual int initialize() override

Initializes the plugin. In practice this means getting the weights onto the GPU: copy them from host memory to device memory, and if FP16 was selected, convert them first and then copy.

int PReLUPlugin::initialize() {
    convertAndCopyToDeivce(mDeviceKernel, mWeights, mDataType);
    return 0;
}
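The article never shows convertAndCopyToDeivce (spelling as in the original source). A minimal sketch of what it plausibly does, assuming the host weights are stored as FP32, that <cuda_fp16.h> and <vector> are included, and with error checking omitted:

// Assumed helper: allocate device memory and upload the weights,
// converting FP32 -> FP16 on the host first when FP16 inference was chosen.
void convertAndCopyToDeivce(void *&deviceWeights, const nvinfer1::Weights &weights,
                            nvinfer1::DataType targetType) {
    size_t size = weights.count * type2size(targetType);
    cudaMalloc(&deviceWeights, size);
    if (targetType == nvinfer1::DataType::kHALF && weights.type == nvinfer1::DataType::kFLOAT) {
        // Convert on the host, then copy the converted buffer to the device.
        std::vector<__half> tmp(weights.count);
        const float *src = static_cast<const float *>(weights.values);
        for (int64_t i = 0; i < weights.count; ++i) {
            tmp[i] = __half(src[i]);
        }
        cudaMemcpy(deviceWeights, tmp.data(), size, cudaMemcpyHostToDevice);
    } else {
        // Types already match: copy the weights as-is.
        cudaMemcpy(deviceWeights, weights.values, size, cudaMemcpyHostToDevice);
    }
}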

virtual void terminate() override

Releases host and device memory, exactly like the destructor; the difference is who calls it (TensorRT calls terminate() when the engine is destroyed).

void PReLUPlugin::terminate() {
    if (mWeights.values) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
    if (mDeviceKernel) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
}

virtual size_t getWorkspaceSize(int maxBatchSize) const override

Returns the amount of scratch workspace the layer needs at enqueue() time. PReLU needs no extra workspace, so simply return 0.

size_t PReLUPlugin::getWorkspaceSize(int maxBatchSize) const
{
    return 0;
}

virtual const char* getPluginType() const override

Returns the plugin type string; TensorRT uses it (together with the version) to identify the plugin.

const char *PReLUPlugin::getPluginType() const {
    return G_PRELU_TYPE;
}

virtual const char* getPluginVersion() const override;

Returns the plugin version string, which TensorRT likewise uses to identify the plugin.

const char *PReLUPlugin::getPluginVersion() const {
    return G_PLUGIN_VERSION;
}

virtual void destroy()

The interface TensorRT calls to destroy the plugin object.

void PReLUPlugin::destroy() {
    delete this; 
}

virtual nvinfer1::IPluginV2* clone() const override;

nvinfer1::IPluginV2* PReLUPlugin::clone() const {
    // Create a fresh copy with the same weights; the weights-pointer
    // constructor above performs the deep copy.
    return new PReLUPlugin(&mWeights, 1);
}

virtual void setPluginNamespace(const char* pluginNamespace) override {}

Nothing to implement; leave the body empty.

virtual const char* getPluginNamespace() const override

const char* PReLUPlugin::getPluginNamespace() const {
    return G_PLUGIN_NAMESPACE;
}
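G_PLUGIN_VERSION and G_PLUGIN_NAMESPACE are referenced above but never defined in the article. Plausible definitions, with the caveat that the actual values must match whatever your plugin creator registers (these are assumptions):

// Assumed constants; values must agree with the plugin creator's registration.
static const char* G_PLUGIN_VERSION = "1";
static const char* G_PLUGIN_NAMESPACE = "";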

virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override

This is where the layer actually runs: enqueue() computes the element count from the stored dimensions and dispatches the CUDA inference function for the configured precision.

int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs,
                         void *workspace, cudaStream_t stream)
{
    const int count = batchSize * mNbInputChannels * mNbInputWidth * mNbInputHeight;
    const int channels = mNbInputChannels;
    const int dim = mNbInputWidth * mNbInputHeight;
    const int div_factor = 1;
    if (mDataType == nvinfer1::DataType::kFLOAT) {
        const float zerof{0.0f};
        CUDA_CHECK(Forward_gpu(count, channels, dim,
                               reinterpret_cast<const float *>(mDeviceKernel),
                               reinterpret_cast<const float *>(inputs[0]),
                               reinterpret_cast<float *>(outputs[0]),
                               zerof, div_factor, stream));
    } else {
        const __half zeroh = __half(0.0f);
        CUDA_CHECK(Forward_gpu(count, channels, dim,
                               reinterpret_cast<const __half *>(mDeviceKernel),
                               reinterpret_cast<const __half *>(inputs[0]),
                               reinterpret_cast<__half *>(outputs[0]),
                               zeroh, div_factor, stream));
    }
    return 0;
}
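CUDA_CHECK is another helper the article assumes; a minimal sketch that logs and aborts on failure (assuming <cstdio> and <cstdlib> are included):

// Assumed error-checking macro around calls that return cudaError_t.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t status = (call);                                  \
        if (status != cudaSuccess) {                                  \
            printf("CUDA failure: %s\n", cudaGetErrorString(status)); \
            abort();                                                  \
        }                                                             \
    } while (0)
#endif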

Kernel invocation

template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim,
                             const Ftype* slope_data, const Ftype* in, Ftype* out,
                             const Ftype zero, const int div_factor) {
    CUDA_KERNEL_LOOP(index, n) {
        // Recover the channel index to pick that channel's learned slope.
        int c = (index / dim) % channels / div_factor;
        if (in[index] > zero) {
            out[index] = in[index];
        } else {
            out[index] = in[index] * slope_data[c];
        }
    }
}

// Defined after the kernel so the kernel's name is visible at the call site.
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim,
                        const Ftype* mDeviceKernel, const Ftype* bottom_data,
                        Ftype* top_data, const Ftype zero,
                        const int div_factor, const cudaStream_t stream) {
    PReLUForward<<<CAFFE_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
        count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
    return cudaGetLastError();
}
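If Forward_gpu lives in a .cu file while enqueue() is compiled in a separate translation unit, the template needs explicit instantiations for the two element types used; a sketch under that assumption about the project layout:

// Explicit instantiations so a separately compiled enqueue() can link.
template cudaError_t Forward_gpu<float>(const int, const int, const int,
    const float*, const float*, float*, const float, const int, const cudaStream_t);
template cudaError_t Forward_gpu<__half>(const int, const int, const int,
    const __half*, const __half*, __half*, const __half, const int, const cudaStream_t);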