tensorrt cuda函数编写_综合

cu文件的编写

.h文件的编写
.cpp
.cu

cuda编程手册地址, cuda基础

.h文件的编写

#ifndef TINYTRT_DECODE_H
#define TINYTRT_DECODE_H
#include "Trt.h"
#include "utils.h"
#include "YoloLayerPlugin/YoloLayerPlugin.hpp"
using namespace std;
struct Bbox {
    int left, right, top, bottom;int clsId;float score;
};
cudaError_t decode_gpu(vector<float> input,YoloKernel yolo_kernel,vector<Detection>& output);
float my_decode(vector<float> intput,YoloKernel yolo_kernel,vector<Detection>& output);
#endif //TINYTRT_DECODE_H

可以看出decode.h只有两个函数一个是cpu段的解码,一个是gpu段的解码,可以将cpu段解码的代码写进decode.cu文件中,也可以单独写进decode.cpp中.

.cpp

#include "decode.h"
float my_Logist(float data){
     return 1./(1. + exp(-data)); }
float my_decode(vector<float> intput,YoloKernel yolo_kernel,vector<Detection>& output)
{
    YoloKernel yolo=yolo_kernel;int stride=yolo.width*yolo.height;for(int i=0;i<yolo.width*yolo.height;i++){
    for(int j=0;j<3;j++){
    int begin_id=7*stride*j+i;int obj_id=begin_id+4*stride;float obj_prob=my_Logist(intput[obj_id]);if(obj_prob<0.7)continue;int class_id=-1;float max_prob=0.7;for(int k=0;k<2;k++){
    float temp_prob=my_Logist(intput[begin_id+(5+k)*stride])*obj_prob;if(temp_prob>max_prob){
    class_id=k;max_prob=temp_prob;}if(class_id>=0){
    Detection det;int row=i/yolo.width;int cols=i%yolo.height;float a=my_Logist(intput[begin_id]);float b=my_Logist(intput[begin_id+stride]);det.bbox[0]=(cols+a)/yolo.width;det.bbox[1]=(row+b)/yolo.height;det.bbox[2]=exp(intput[begin_id+2*stride])*yolo.anchors[2*j];det.bbox[3]=exp(intput[begin_id+3*stride])*yolo.anchors[2*j+1];det.classId=class_id;det.prob=max_prob;output.emplace_back(det);}}}}
}

cpu端的解码没有什么可讲的,理解模型最后输出的是什么就可以.
模型最后的输出是一个batch*(channel*(box+1+class))yolo.wyolo.h的float数组,box指的是x,y,w,h.1代表的是这个anchor内有没有目标,class代表的是种类.

.cu

#include "decode.h"
#include "Trt.h"
#include "utils.h"
#include "math.h"
#include <iostream>
#include <fstream>
#include <cmath>
#include "../plugin/YoloLayerPlugin/YoloLayerPlugin.hpp"
__device__ float Logist1(float data){
     return 1./(1. + exp(-data)); };
__global__ void caldetection(const float* input,float* output,int noelements,int yolowidth,int yoloheight,const float anchors[6],int classes,int outputElem)
{
    int idx=threadIdx.x+blockDim.x*blockIdx.x;if(idx>noelements)return;int stride=yoloheight*yolowidth;int bnidx=idx/stride;int curidx=idx-bnidx*stride;const float* curinput=input+bnidx*(7)*stride*3;for(int k=0;k<3;k++){
    int beginidx=(7*stride)*k+curidx;int objidx=beginidx+stride*4;float objprob=Logist1(curinput[objidx]);if(objprob <= 0.7)continue;int row = curidx / yolowidth;int cols = curidx % yolowidth;int classId = -1;float maxProb = IGNORE_THRESH;for (int c = 0;c<2;++c){
    float cProb =  Logist1(curinput[beginidx + (5 + c) * stride]) * objprob;if(cProb > maxProb){
    maxProb = cProb;classId = c;}}if(classId >= 0) {
    float *curOutput = output + bnidx*outputElem;int resCount = (int)atomicAdd(curOutput,1);char* data = (char * )curOutput + sizeof(float) + resCount*sizeof(Detection);Detection* det =  (Detection*)(data);det->bbox[0] = (cols + Logist1(curinput[beginidx]))/ yolowidth;det->bbox[1] = (row + Logist1(curinput[beginidx+stride]))/ yoloheight;det->bbox[2] = exp(curinput[beginidx+2*stride]) * anchors[2*k];det->bbox[3] = exp(curinput[beginidx+3*stride]) * anchors[2*k + 1];float tem_cla=float(classId);det->classId = llround(double(tem_cla));det->prob = maxProb;}}}
cudaError_t decode_gpu(vector<float> input,YoloKernel yolo_kernel,vector<Detection>& output)
{
    float* temp_input;int input_num=input.size();CUDA_CHECK(cudaMalloc(&temp_input,input.size()*sizeof(float)));CUDA_CHECK(cudaMemcpy(temp_input,&input[0],input.size()*sizeof(float),cudaMemcpyHostToDevice));float* output1;void* devAnchor;size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));int outputElem = 1;outputElem+=yolo_kernel.width*yolo_kernel.height*3*sizeof(Detection)/sizeof(float);CUDA_CHECK(cudaMalloc(&output1,sizeof(float)*outputElem));int numelem=yolo_kernel.width*yolo_kernel.height;CUDA_CHECK(cudaMemcpyAsync(devAnchor,yolo_kernel.anchors,AnchorLen,cudaMemcpyHostToDevice));caldetection<<<(yolo_kernel.width*yolo_kernel.height+512-1)/512,512>>>(temp_input,output1,numelem,yolo_kernel.width,yolo_kernel.height,(float *)devAnchor,2,outputElem);cudaError_t cudaStatus;cudaFree(devAnchor);float* out_host{
    };CUDA_CHECK(cudaMallocHost(&out_host,sizeof(float)*outputElem));CUDA_CHECK(cudaMemcpy(out_host,output1,sizeof(float)*outputElem,cudaMemcpyDeviceToHost));
// printf("第一个输出%f,第二个输出%f",out_host[0],out_host[1]);for(int k=0;k<int(out_host[0]);k++){
    Detection temp;temp.bbox[0]=out_host[6*k+1];temp.bbox[1]=out_host[6*k+2];temp.bbox[2]=out_host[6*k+3];temp.bbox[3]=out_host[6*k+4];temp.classId=out_host[6*k+5];temp.prob=out_host[6*k+6];output.push_back(temp);}cudaStatus = cudaGetLastError();if (cudaStatus != cudaSuccess) {
    std::cerr << "CUDA error " << cudaGetErrorString(cudaStatus) << " at " << __FILE__ << ":" << __LINE__ << std::endl;}return cudaGetLastError();

.cu文件中要注意,kernel函数中不能debug进去,也不能将中间结果输出到txt中查看,(目前水平是这样),因此只能靠printf调试,还是很麻烦的.
注意__device__,global,__host__的区别,__device__由gpu调用在gpu上执行,global__由cpu调用在GPU上执行.所以decode.cu的核函数为__global,再进去__global__函数前,要将cpu中的数据拷贝到gpu,还要在gpu上申请输出的空间块(这个空间要大于等于你所需要的空间)
这一块指针,空间申请之类的c++,cuda语法还要多多研究,先谈点现在简单的理解.

 float* temp_input;CUDA_CHECK(cudaMalloc(&temp_input,input.size()*sizeof(float)));CUDA_CHECK(cudaMemcpy(temp_input,&input[0],input.size()*sizeof(float),cudaMemcpyHostToDevice));void* devAnchor;size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));CUDA_CHECK(cudaMemcpy(devAnchor,yolo_kernel.anchors,AnchorLen,cudaMemcpyHostToDevice));float* output1;int outputElem = 1;outputElem+=yolo_kernel.width*yolo_kernel.height*3*sizeof(Detection)/sizeof(float);CUDA_CHECK(cudaMalloc(&output1,sizeof(float)*outputElem));

关于cudaMalloc()和cudaMemcpy()函数的使用详见cuda编程手册.
注意核函数中要用到的所有host数据都要cpy到device在送入核函数.

float* out_host{
    };
CUDA_CHECK(cudaMallocHost(&out_host,sizeof(float)*outputElem));
CUDA_CHECK(cudaMemcpy(out_host,output1,sizeof(float)*outputElem,cudaMemcpyDeviceToHost));

这一步是将device中的值cpy到host处理,要不gpu中的值是不可见状态.

caldetection<<<(yolo_kernel.width*yolo_kernel.height+512-1)/512,512>>>(temp_input,output1,numelem,yolo_kernel.width,yolo_kernel.height,(float *)devAnchor,2,outputElem);

注意下kernel函数的调用,使用的是<<<block,thread>>>形式,其block,thread可以是一维,也可以是二维,每个的idx是确认好的.核函数简单理解就是同时进行很多for循环,这一部分还要多看多写.

cudaFree(devAnchor)
cudaFree(output1);
cudaFreeHost(out_host);

最后使用完要记得释放指针,避免报段错误.

int nByte=sizeof(float)*nElem;
float *res_h=(float*)malloc(nByte);
memset(res_h,0,nByte);
free(res_h);

常见的c++内存拷贝

float *curOutput = output + bnidx*outputElem;
int resCount = (int)atomicAdd(curOutput,1);
char* data = (char * )curOutput + sizeof(float) + resCount*sizeof(Detection);
Detection* det =  (Detection*)(data);
det->bbox[0] = (cols + Logist1(curinput[beginidx]))/ yolowidth;

注意下atomicADD()的使用,大概是用来多线程计数的.指针数组申请时一定要分配空间块大小及地址.还有强制类型转换的用法.