TensorRT Custom Layer Class Implementation Explained
- Common utilities
  - Macro definitions
  - Macro functions
  - Kernel-related definitions
- class PReLUPlugin in detail
  - Private members
  - PReLUPlugin(const nvinfer1::Weights* weights, int nbWeights);
  - PReLUPlugin::PReLUPlugin(const void *data, size_t length)
  - virtual size_t getSerializationSize() const override
  - virtual void serialize(void* buffer) const override;
  - PReLUPlugin() = delete;
  - ~PReLUPlugin();
  - virtual int getNbOutputs() const override;
  - virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
  - virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
  - virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
  - virtual int initialize() override
  - virtual void terminate() override
  - virtual size_t getWorkspaceSize(int maxBatchSize) const override
  - virtual const char* getPluginType() const override
  - virtual const char* getPluginVersion() const override;
  - virtual void destroy()
  - virtual nvinfer1::IPluginV2* clone() const override;
  - virtual void setPluginNamespace(const char* pluginNamespace) override {}
  - virtual const char* getPluginNamespace() const override
  - virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
  - Kernel invocation
Common utilities
Macro definitions
static const char* G_PRELU_TYPE = "PReLU";
static const char* G_PRELU_NAME = "PReLU_TRT";
Macro functions
#define CUDA_KERNEL_LOOP(i, n) \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
         i < (n); \
         i += blockDim.x * gridDim.x)
Kernel-related definitions
static const int CUDA_NUM_THREADS = 512; // CUDA: number of threads per block.

// CUDA: number of blocks needed to cover N elements.
inline int CAFFE_GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
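As a worked example (the numbers are chosen for illustration, not from the original): for a single 64x56x56 input, the element count and launch configuration come out as follows, and CUDA_KERNEL_LOOP's grid-stride loop keeps the kernel correct even when the grid covers fewer threads than elements.

// Illustrative numbers (assumed): one input of shape 64 x 56 x 56.
const int count = 64 * 56 * 56;              // 200704 elements
const int blocks = CAFFE_GET_BLOCKS(count);  // (200704 + 511) / 512 = 392
// launch: kernel<<<392, 512, 0, stream>>>(...);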
class PReLUPlugin in detail
Private members
The input CHW dimensions, the weight values and their type, and the GPU-side copy of the weights:
int mNbInputChannels, mNbInputHeight, mNbInputWidth;
nvinfer1::Weights mWeights;
nvinfer1::DataType mDataType{nvinfer1::DataType::kFLOAT};
void* mDeviceKernel{nullptr};
PReLUPlugin(const nvinfer1::Weights* weights, int nbWeights);
This constructor reads the layer's parameters (weights, biases, and so on) into the plugin:
it copies the parameter values from the model file into the private members.
PReLUPlugin::PReLUPlugin(const nvinfer1::Weights *weights, int nbWeights) {
    mWeights = weights[0];
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), weights[0].values,
           mWeights.count * type2size(mWeights.type));
}
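type2size is used throughout but not shown in this excerpt; a minimal sketch (an assumption), covering only the two data types this plugin supports:

// Assumed sketch of type2size: byte width of an nvinfer1::DataType.
inline size_t type2size(nvinfer1::DataType type) {
    switch (type) {
        case nvinfer1::DataType::kFLOAT: return 4;  // 32-bit float
        case nvinfer1::DataType::kHALF:  return 2;  // 16-bit half
        default: ASSERT(false); return 0;
    }
}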
PReLUPlugin::PReLUPlugin(const void *data, size_t length)
Reads the plugin's data back from the serialized engine file:
copy the CHW dimensions and the nvinfer1::Weights fields into the private members,
then assert that exactly `length` bytes were consumed.
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
    const char *d = static_cast<const char *>(data), *a = d;
    read<int>(d, mNbInputChannels);
    read<int>(d, mNbInputHeight);
    read<int>(d, mNbInputWidth);
    read<nvinfer1::DataType>(d, mDataType);
    read<int64_t>(d, mWeights.count);
    read<nvinfer1::DataType>(d, mWeights.type);
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), d,
           mWeights.count * type2size(mWeights.type));
    d = d + mWeights.count * type2size(mWeights.type);
    ASSERT(d == a + length);
}
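The read helper used above (and the write helper used in serialize() below) are not shown in the excerpt either; they are typically the standard pointer-advancing templates, sketched here as an assumption:

// Assumed sketch of the pointer-advancing serialization helpers.
template <typename T>
void read(const char*& buffer, T& val) {
    val = *reinterpret_cast<const T*>(buffer);  // copy one field out
    buffer += sizeof(T);                        // advance past it
}

template <typename T>
void write(char*& buffer, const T& val) {
    *reinterpret_cast<T*>(buffer) = val;        // copy one field in
    buffer += sizeof(T);
}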
virtual size_t getSerializationSize() const override
Returns the number of bytes this plugin occupies in the serialization buffer: simply the
combined size of every field the deserializing constructor above reads.
size_t PReLUPlugin::getSerializationSize() const {
    return sizeof(mNbInputChannels) + sizeof(mNbInputWidth) + sizeof(mNbInputHeight)
         + sizeof(mDataType) + sizeof(mWeights.count) + sizeof(mWeights.type)
         + mWeights.count * type2size(mWeights.type);
}
virtual void serialize(void* buffer) const override;
Serializes the plugin into the buffer. This is the mirror image of the deserializing
constructor: one reads and the other writes exactly the same fields in the same order.
void PReLUPlugin::serialize(void *buffer) const {
    char *d = static_cast<char *>(buffer), *a = d;
    write(d, mNbInputChannels);
    write(d, mNbInputHeight);
    write(d, mNbInputWidth);
    write(d, mDataType);
    write(d, mWeights.count);
    write(d, mWeights.type);
    convertAndCopyToBuffer(d, mWeights, mWeights.type);
    ASSERT(d == a + getSerializationSize());
}
PReLUPlugin() = delete;
A default constructor makes no sense for this plugin, so it is explicitly deleted.
~PReLUPlugin();
Destructor
Free the memory you allocated, and null the pointers to guard against a double free.
PReLUPlugin::~PReLUPlugin() {
    if (mWeights.values) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
    if (mDeviceKernel) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
}
virtual int getNbOutputs() const override;
Returns the number of output tensors. PReLU, like ReLU, produces a single output, so this
returns 1; for your own layer, return however many outputs one forward pass produces.
int PReLUPlugin::getNbOutputs() const {
    return 1;
}
virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
Returns the dimensions of each output tensor, computed from the input dimensions and the
layer's operation. The index argument selects which output is being queried; since PReLU
has exactly one, shape-preserving output, only index 0 is valid here.
nvinfer1::Dims PReLUPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) {
    if (index == 0) {
        return nvinfer1::Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }
    // else if (index == n) { ... for other outputs, if they exist ... }
    else {
        ASSERT(false);
    }
}
virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
Declares which data types and tensor formats the plugin supports (FP32, FP16, ...).
bool PReLUPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
        && format == nvinfer1::PluginFormat::kNCHW;
}
virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
Asserts that the chosen type and format are supported, then caches the input dimensions
and data type in the private members.
void PReLUPlugin::configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) {
    ASSERT((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW);
    mNbInputChannels = inputDims[0].d[0];
    mNbInputHeight = inputDims[0].d[1];
    mNbInputWidth = inputDims[0].d[2];
    mDataType = type;
}
virtual int initialize() override
Initializes the plugin for execution: copy the weights from host memory to the GPU. If
FP16 was selected, the weights must of course be converted first and then copied over.
int PReLUPlugin::initialize() {
    convertAndCopyToDevice(mDeviceKernel, mWeights, mDataType);
    return 0;
}
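convertAndCopyToDevice is not shown in the excerpt; a sketch of what such a helper typically does (an assumption, modeled on the TensorRT samplePlugin pattern), converting on the host when the engine runs in FP16:

#include <cuda_fp16.h>  // __half, __float2half
#include <vector>

// Assumed sketch: upload weights to the GPU, converting FP32 -> FP16 first
// if the engine's runtime type differs from the stored weight type.
void convertAndCopyToDevice(void*& deviceWeights, const nvinfer1::Weights& weights,
                            nvinfer1::DataType targetType) {
    size_t size = weights.count * type2size(targetType);
    if (weights.type == nvinfer1::DataType::kFLOAT
        && targetType == nvinfer1::DataType::kHALF) {
        std::vector<__half> tmp(weights.count);  // host-side conversion buffer
        for (int64_t i = 0; i < weights.count; ++i)
            tmp[i] = __float2half(static_cast<const float*>(weights.values)[i]);
        cudaMalloc(&deviceWeights, size);
        cudaMemcpy(deviceWeights, tmp.data(), size, cudaMemcpyHostToDevice);
    } else {
        cudaMalloc(&deviceWeights, size);
        cudaMemcpy(deviceWeights, weights.values, size, cudaMemcpyHostToDevice);
    }
}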
virtual void terminate() override
Releases host and device memory, the same work as the destructor; it is simply invoked by
TensorRT at a different point in the plugin's lifecycle.
void PReLUPlugin::terminate() {
    if (mWeights.values) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
    if (mDeviceKernel) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
}
virtual size_t getWorkspaceSize(int maxBatchSize) const override
Returns the amount of scratch workspace the layer needs during enqueue(); PReLU needs none, so return 0.
size_t PReLUPlugin::getWorkspaceSize(int maxBatchSize) const
{
return 0;
}
virtual const char* getPluginType() const override
Returns the plugin's type string, which TensorRT uses to identify the plugin.
const char *PReLUPlugin::getPluginType() const {
return G_PRELU_TYPE;
}
virtual const char* getPluginVersion() const override;
Returns the plugin's version string, also used by TensorRT to identify the plugin.
const char *PReLUPlugin::getPluginVersion() const {
return G_PLUGIN_VERSION;
}
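G_PLUGIN_VERSION (used here) and G_PLUGIN_NAMESPACE (used below) are presumably defined alongside G_PRELU_TYPE and G_PRELU_NAME; their exact values are not in the excerpt, so the ones below are assumptions:

static const char* G_PLUGIN_VERSION = "1";    // assumed value
static const char* G_PLUGIN_NAMESPACE = "";   // assumed value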
virtual void destroy()
The interface through which TensorRT destroys the plugin object.
void PReLUPlugin::destroy() {
delete this;
}
virtual nvinfer1::IPluginV2* clone() const override;
Creates a new plugin instance with the same weights, reusing the weight-copying constructor above.
nvinfer1::IPluginV2* PReLUPlugin::clone() const {
return new PReLUPlugin(&mWeights, 1);
}
virtual void setPluginNamespace(const char* pluginNamespace) override {}
No implementation is needed; the body is left empty.
virtual const char* getPluginNamespace() const override
Returns the plugin's namespace string.
const char* PReLUPlugin::getPluginNamespace() const {
return G_PLUGIN_NAMESPACE;
}
virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
The actual implementation of the layer's functionality: this is where the inference (CUDA kernel) call happens.
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream)
{
    const int count = batchSize * mNbInputChannels * mNbInputWidth * mNbInputHeight;
    const int channels = mNbInputChannels;
    const int dim = mNbInputWidth * mNbInputHeight;
    const int div_factor = 1;
    if (mDataType == nvinfer1::DataType::kFLOAT) {
        const float zerof{0.0f};
        CUDA_CHECK(Forward_gpu(count, channels, dim,
                               reinterpret_cast<const float *>(mDeviceKernel),
                               reinterpret_cast<const float *>(inputs[0]),
                               reinterpret_cast<float *>(outputs[0]),
                               zerof, div_factor, stream));
    } else {
        const __half zeroh = __half(0.0f);
        CUDA_CHECK(Forward_gpu(count, channels, dim,
                               reinterpret_cast<const __half *>(mDeviceKernel),
                               reinterpret_cast<const __half *>(inputs[0]),
                               reinterpret_cast<__half *>(outputs[0]),
                               zeroh, div_factor, stream));
    }
    return 0;
}
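CUDA_CHECK wraps each Forward_gpu call above; its definition is not in the excerpt, but a typical version (an assumption) aborts with a message when a CUDA call reports an error:

// Assumed sketch of a conventional CUDA error-checking macro.
#define CUDA_CHECK(status)                                          \
    do {                                                            \
        cudaError_t err = (status);                                 \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            abort();                                                \
        }                                                           \
    } while (0)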
Kernel invocation
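For reference, per channel c PReLU computes out = in when in > 0 and out = a_c * in otherwise, where a_c is the learned slope stored in mDeviceKernel. With div_factor equal to channels the slope would be shared across all channels; here div_factor = 1, so there is one slope per channel. The two functions below are a direct translation of this formula.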
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim,
                        const Ftype* mDeviceKernel,
                        const Ftype* bottom_data, Ftype* top_data, const Ftype zero,
                        const int div_factor, const cudaStream_t stream) {
    PReLUForward<<<CAFFE_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
        count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
    cudaError_t err = cudaGetLastError();
    return err;
}
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim,
                             const Ftype* slope_data,
                             const Ftype* in, Ftype* out,
                             const Ftype zero, const int div_factor) {
    CUDA_KERNEL_LOOP(index, n) {
        int c = (index / dim) % channels / div_factor;
        if (in[index] > zero) {
            out[index] = in[index];
        } else {
            out[index] = in[index] * slope_data[c];
        }
    }
}
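Finally, a hypothetical usage sketch (not part of the original): wiring the plugin into a network built with the TensorRT C++ API. The helper name and its arguments are illustrative.

#include "NvInfer.h"

// Hypothetical helper: wrap per-channel slopes in a PReLUPlugin and append it
// to a network under construction. addPluginV2 does not take ownership, so
// the plugin object must stay alive until the engine is built.
nvinfer1::ITensor* addPRelu(nvinfer1::INetworkDefinition* network,
                            nvinfer1::ITensor* input,
                            const float* slopeData, int64_t numChannels) {
    nvinfer1::Weights slopes{nvinfer1::DataType::kFLOAT, slopeData, numChannels};
    PReLUPlugin* plugin = new PReLUPlugin(&slopes, 1);  // constructor copies the weights
    nvinfer1::IPluginV2Layer* layer = network->addPluginV2(&input, 1, *plugin);
    return layer->getOutput(0);
}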