A CUDA stream represents a queue of GPU operations, and the operations in that queue execute in the order in which they were added. Streams enable task-level parallelism: for example, the GPU can exchange data with the host while a kernel is executing, provided the device supports overlap (the deviceOverlap field of its cudaDeviceProp is true).
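Whether a given device supports this overlap can be queried at runtime. A minimal sketch, assuming device 0 (on newer runtimes the asyncEngineCount field additionally reports how many copy engines the device has):

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // Non-zero if the device can copy memory while executing a kernel.
    printf("deviceOverlap: %d\n", prop.deviceOverlap);
    // Number of asynchronous copy engines (0, 1, or 2 on most devices).
    printf("asyncEngineCount: %d\n", prop.asyncEngineCount);
    return 0;
}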
cudaMemcpyAsync copies data between the GPU and the host. It is asynchronous: calling it merely places a request for a memory copy into a stream. When the function returns, the copy may not have started, let alone finished; the operation has only been enqueued, and it is guaranteed to execute before any operation placed into the same stream after it.
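A minimal sketch of the call pattern (the buffer names and size here are illustrative, not taken from the program below). Note that cudaMemcpyAsync needs page-locked host memory allocated with cudaHostAlloc; with ordinary pageable memory the transfer may not actually be asynchronous:

#include "cuda_runtime.h"

int main()
{
    const int n = 1024;              // illustrative size
    int *host_buf, *dev_buf;
    cudaStream_t stream;

    cudaStreamCreate(&stream);
    // Pinned (page-locked) host memory, required for true async copies.
    cudaHostAlloc((void**)&host_buf, n * sizeof(int), cudaHostAllocDefault);
    cudaMalloc((void**)&dev_buf, n * sizeof(int));

    // Enqueue the copy: the call returns immediately; the copy runs later,
    // in order relative to other work queued in the same stream.
    cudaMemcpyAsync(dev_buf, host_buf, n * sizeof(int),
                    cudaMemcpyHostToDevice, stream);

    // Block the host until everything queued in this stream has completed.
    cudaStreamSynchronize(stream);

    cudaFree(dev_buf);
    cudaFreeHost(host_buf);
    cudaStreamDestroy(stream);
    return 0;
}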
The experiment copies a data set to the GPU in chunks, runs the kernel on each chunk, and copies the results back, to show that CUDA streams can improve execution efficiency. The idea is to interleave data copies with kernel execution: the second round of copying does not have to wait for the first kernel to finish, which reduces the latency of strictly sequential execution (much like instruction pipelining, which improves throughput provided hazards are resolved).
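The order in which work is enqueued matters. The loop in the program below queues operations breadth-first, alternating between the two streams, rather than depth-first (all of one stream's work for a chunk, then all of the other's). A sketch of the contrast, using the buffer and stream names from the program below:

// Depth-first: stream0's whole chunk, then stream1's. Overlap tends to be
// worse here, because stream1's host-to-device copies queue up behind
// stream0's device-to-host copy on the same copy engine.
cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
// ...then the same four calls for stream1...

// Breadth-first (as in the loop below): alternate streams per operation, so
// a copy from one stream can overlap a kernel from the other.
cudaMemcpyAsync(dev_a0, host_a + i,     N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
// ...and so on, alternating stream0/stream1 for each copy and kernel launch.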
The program code is as follows:
#include "cuda_runtime.h"#include "cutil_inline.h"#include <stdio.h>#include <math.h>static void HandleError( cudaError_t err,const char *file,int line ) { if (err != cudaSuccess) { printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); exit( EXIT_FAILURE ); }}#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))#define N (1024*1024)#define FULL_DATA_SIZE N*20__global__ void kernel(int* a, int *b, int*c){ int idx = blockIdx.x * blockDim.x + threadIdx.x; int offset = gridDim.x * blockDim.x; if (idx < N) { int idx1 = (idx + 1) % 256; int idx2 = (idx + 2) % 256; float as = (a[idx] + a[idx1] + a[idx2]) / 3; float bs = (b[idx] + b[idx1] + b[idx2]) / 3; c[idx] = (as + bs) / 2; }}int main(){ cudaDeviceProp prop; int devID; HANDLE_ERROR(cudaGetDevice(&devID)); HANDLE_ERROR(cudaGetDeviceProperties(&prop, devID)); if (!prop.deviceOverlap) { printf("No device will handle overlaps. so no speed up from stream.\n"); return 0; } cudaEvent_t start, stop; float elapsedTime; HANDLE_ERROR(cudaEventCreate(&start)); HANDLE_ERROR(cudaEventCreate(&stop)); HANDLE_ERROR(cudaEventRecord(start, 0)); cudaStream_t stream0; cudaStream_t stream1; HANDLE_ERROR(cudaStreamCreate(&stream0)); HANDLE_ERROR(cudaStreamCreate(&stream1)); int *host_a, *host_b, *host_c; int *dev_a0, *dev_b0, *dev_c0; int *dev_a1, *dev_b1, *dev_c1; HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N*sizeof(int))); HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N*sizeof(int))); HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N*sizeof(int))); HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N*sizeof(int))); HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N*sizeof(int))); HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N*sizeof(int))); HANDLE_ERROR(cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE*sizeof(int), cudaHostAllocDefault)); HANDLE_ERROR(cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE*sizeof(int), cudaHostAllocDefault)); HANDLE_ERROR(cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE*sizeof(int), cudaHostAllocDefault)); for (int i=0; i<FULL_DATA_SIZE; i++) { host_a[i] = rand(); host_b[i] = rand(); } // tasks are put into stack for gpu execution for (int i=0; i<FULL_DATA_SIZE; i+=2*N) { HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0)); HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1)); HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0)); HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1)); kernel<<<N/256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0); kernel<<<N/256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1); HANDLE_ERROR(cudaMemcpyAsync(host_c+i, dev_c0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0)); HANDLE_ERROR(cudaMemcpyAsync(host_c+i+N, dev_c1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1));// HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));// HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));// kernel<<<N/256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);// HANDLE_ERROR(cudaMemcpyAsync(host_c+i, dev_c0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0));// // HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1));// HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1));// kernel<<<N/256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);// HANDLE_ERROR(cudaMemcpyAsync(host_c+i+N, dev_c1, N*sizeof(int), 
cudaMemcpyDeviceToHost, stream1)); } // wait until gpu execution finish HANDLE_ERROR(cudaStreamSynchronize(stream0)); HANDLE_ERROR(cudaStreamSynchronize(stream1)); HANDLE_ERROR(cudaEventRecord(stop, 0)); HANDLE_ERROR(cudaEventSynchronize(stop)); HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop)); printf("Time taken: %3.1f ms\n", elapsedTime); // free stream and mem HANDLE_ERROR(cudaFreeHost(host_a)); HANDLE_ERROR(cudaFreeHost(host_b)); HANDLE_ERROR(cudaFreeHost(host_c)); HANDLE_ERROR(cudaFree(dev_a0)); HANDLE_ERROR(cudaFree(dev_b0)); HANDLE_ERROR(cudaFree(dev_c0)); HANDLE_ERROR(cudaFree(dev_a1)); HANDLE_ERROR(cudaFree(dev_b1)); HANDLE_ERROR(cudaFree(dev_c1)); HANDLE_ERROR(cudaStreamDestroy(stream0)); HANDLE_ERROR(cudaStreamDestroy(stream1)); return 0;}
Compared with sequential execution, using two CUDA streams reduced the program's running time by about 20 ms (with this modest data size, the advantage of streams is not very pronounced).
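The sequential baseline itself is not shown above. A minimal sketch of what it might look like, reusing the kernel, stream0, and the buffers from the program (the exact chunking of the baseline is an assumption):

// Hypothetical single-stream baseline: every chunk's copies and kernel go
// into one stream, so each operation waits for the previous one to finish.
for (int i = 0; i < FULL_DATA_SIZE; i += N) {
    HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));
    HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));
    kernel<<<N/256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
    HANDLE_ERROR(cudaMemcpyAsync(host_c+i, dev_c0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0));
}
HANDLE_ERROR(cudaStreamSynchronize(stream0));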