0_Simple__asyncAPI

Posted by 爨爨爨好


A quick look at the simple API functions involved in CPU-GPU interaction.

Source code:

//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution.  Events are inserted into a stream
// of CUDA calls.  Since CUDA stream calls are asynchronous, the CPU can
// perform computations while GPU is executing (including DMA memcopies
// between the host and device).  CPU can query CUDA events to determine
// whether GPU has completed tasks.
//

// includes, system
#include <stdio.h>

// includes CUDA Runtime
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// includes, project
#include <D:\Program\CUDA\Samples\common\inc\helper_cuda.h>
#include <D:\Program\CUDA\Samples\common\inc\helper_functions.h> // helper utility functions

__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    }
    return true;
}

int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 26;

    // allocate host memory
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    // allocate device memory
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0)
    sdkStartTimer(&timer);
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;

    while (cudaEventQuery(stop) == cudaErrorNotReady)
        counter++;

    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    // check the output for correctness
    bool bFinalResults = correct_output(a, n, value);

    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    getchar();
    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

 

▶ Output:

[D:\Code\CUDA\ProjectCUDA\x64\Debug\ProjectCUDA.exe] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

CUDA device [GeForce GTX 1070]
time spent executing by the GPU: 11.65
time spent by CPU in CUDA calls: 0.04
CPU executed 5161 iterations while waiting for GPU to finish

 

▶ Things learned:

● argv[0], the 0th argument passed to main, is the program name string and can be used in output.

int main(int argc, char *argv[])
...
printf("%s", argv[0]);

 

● Finding the most suitable CUDA device

inline int findCudaDevice(int argc, const char **argv)

In practice, just forward main's arguments; the function returns the device ID.

int devID = findCudaDevice(argc, (const char **)argv);
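If helper_cuda.h is not available, roughly the same idea can be sketched with the plain runtime API. This is only an approximation of what findCudaDevice does (not the helper's exact logic): pick the device with the most multiprocessors and make it current.

int pickDevice()
{
    int count = 0, best = 0, bestSM = -1;
    cudaGetDeviceCount(&count);                  // number of CUDA-capable devices
    for (int i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        if (prop.multiProcessorCount > bestSM)   // crude "best device" heuristic
        {
            bestSM = prop.multiProcessorCount;
            best = i;
        }
    }
    cudaSetDevice(best);                         // make it the current device
    return best;
}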

 

● Allocating page-locked (pinned) host memory without any extra flags; note that it must be released with cudaFreeHost.

int *a = NULL;
int nbytes = n * sizeof(int);
cudaMallocHost((void **)&a, nbytes);
...
cudaFreeHost(a);
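For reference, cudaMallocHost is documented as equivalent to cudaHostAlloc with the default flag; a sketch of the flagged form, in case flags such as cudaHostAllocMapped are needed later:

int *b = NULL;
cudaHostAlloc((void **)&b, nbytes, cudaHostAllocDefault);  // same behavior as cudaMallocHost
...
cudaFreeHost(b);                                           // still released with cudaFreeHost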

 

● Timing how long the CPU spends issuing CUDA calls

StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
sdkStartTimer(&timer);

... // asynchronous CUDA calls (kernel launch, memcopies)

sdkStopTimer(&timer);
printf("%.2f ms", sdkGetTimerValue(&timer));
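If the helper_functions.h timer is not at hand, a portable sketch with std::chrono gives the same kind of host-side wall-clock measurement (assumption: only the CPU time spent issuing the calls is of interest):

#include <chrono>

auto t0 = std::chrono::high_resolution_clock::now();
// ... issue the asynchronous CUDA calls here ...
auto t1 = std::chrono::high_resolution_clock::now();
double cpu_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
printf("%.2f ms\n", cpu_ms);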

 

● A function for querying the status of work queued on the GPU

extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);

In use, stop is an event that has been placed into the stream. cudaEventQuery(stop) returns the event's status: cudaSuccess (a macro with value 0) means the event has already occurred, while cudaErrorNotReady means it has not yet occurred. The sample exploits this window to let the CPU spin and count iterations while waiting.

while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;
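When the CPU has no useful work to overlap, busy-waiting is unnecessary; a sketch of the blocking alternative is to wait on the event directly:

checkCudaErrors(cudaEventSynchronize(stop));                    // block host until the event completes
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));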

 

● The <stdlib.h> macros for the success and failure return codes

#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1

 
