0_Simple__asyncAPI
Posted 爨爨爨好
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了0_Simple__asyncAPI相关的知识,希望对你有一定的参考价值。
关于CPU - GPU交互的简单接口函数。
源代码:
//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution. Events are inserted into a stream
// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
// perform computations while GPU is executing (including DMA memcopies
// between the host and device). CPU can query CUDA events to determine
// whether GPU has completed tasks.
//

// includes, system
#include <stdio.h>
#include <stdlib.h> // exit, EXIT_SUCCESS, EXIT_FAILURE
#include <string.h> // memset

// includes CUDA Runtime
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// includes, project
#include <D:\Program\CUDA\Samples\common\inc\helper_cuda.h>
#include <D:\Program\CUDA\Samples\common\inc\helper_functions.h> // helper utility functions

// Adds inc_value to one element of g_data per thread.
//
// Launch layout: 1D grid of 1D blocks; the thread with global index
// blockIdx.x * blockDim.x + threadIdx.x updates g_data at that index.
// Uses no shared memory.
//
// Precondition: the kernel performs no bounds check, so the caller must make
// sure gridDim.x * blockDim.x does not exceed the number of elements in g_data.
__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

// Verifies that the first n elements of the host array `data` all equal x.
// Prints the first mismatching element and returns false; returns true when
// every element matches (including the n == 0 case).
bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    }
    return true;
}

// Issues an H2D copy, a kernel and a D2H copy asynchronously on stream 0,
// spins on the CPU until the GPU work has finished, then reports timings and
// validates the result. Exits with EXIT_SUCCESS when the output is correct.
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    int n = 16 * 1024 * 1024;
    size_t nbytes = (size_t)n * sizeof(int);
    int value = 26;

    // allocate host memory with cudaMallocHost and clear it
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    // allocate device memory; every byte is set to 0xFF so that a missing
    // host-to-device copy would be caught by correct_output()
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    // n is a multiple of the block size, so the grid covers the array exactly;
    // this matters because increment_kernel has no bounds check
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0);
    // the host timer only measures the time spent inside these calls
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    checkCudaErrors(cudaGetLastError()); // a launch reports configuration errors only here
    checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    checkCudaErrors(cudaEventRecord(stop, 0));
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    cudaError_t queryStatus;

    while ((queryStatus = cudaEventQuery(stop)) == cudaErrorNotReady)
        counter++;

    // the loop also ends when the query fails; report that instead of
    // carrying on with an invalid result
    checkCudaErrors(queryStatus);

    // elapsed time between the two events, in milliseconds
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    // check the output for correctness
    bool bFinalResults = correct_output(a, n, value);

    // release resources
    sdkDeleteTimer(&timer);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    // keep the console window open until a key is pressed
    getchar();
    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
输出结果:
[D:\Code\CUDA\ProjectCUDA\x64\Debug\ProjectCUDA.exe] - Starting... GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 CUDA device [GeForce GTX 1070] time spent executing by the GPU: 11.65 time spent by CPU in CUDA calls: 0.04 CPU executed 5161 iterations while waiting for GPU to finish
新姿势:
● 调用主函数时的第0个参数作为程序名字符串,可以用于输出。
1 int main(int argc, char *argv[]) 2 ... 3 printf("%s", argv[0]);
● 寻找最合适的CUDA设备?
inline int findCudaDevice(int argc, const char **argv)
使用中,直接使用主函数的参数,返回设备编号
int devID = findCudaDevice(argc, (const char **)argv);
● 使用cudaMallocHost申请页锁定(pinned)主机内存(不附加任何flag),注意必须使用cudaFreeHost释放
1 int *a, nbytes = n * sizeof(int); 2 cudaMallocHost((void **)&a, nbytes); 3 ... 4 cudaFreeHost(a);
● 记录CPU调用CUDA所用的时间
1 StopWatchInterface *timer = NULL; 2 sdkCreateTimer(&timer); 3 sdkResetTimer(&timer); 4 sdkStartTimer(&timer); 5 6 ...// 核函数调用 7 8 sdkStopTimer(&timer); 9 printf("%.2f ms", sdkGetTimerValue(&timer));
● 查看GPU队列状态的函数
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
使用中,stop为放置到流中的一个事件,cudaEventQuery(stop)返回该事件的状态:等于cudaSuccess(cudaError_t枚举值,值为0)表示事件已经发生;等于cudaErrorNotReady(同为cudaError_t枚举值,其具体数值随CUDA版本不同而变化,应直接与符号名比较)表示尚未发生。源代码中利用这段时间让CPU空转,记录了迭代次数。
while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;
● <stdlib.h>中关于返回成功和失败的宏
1 #define EXIT_SUCCESS 0 2 #define EXIT_FAILURE 1
以上是关于0_Simple__asyncAPI的主要内容,如果未能解决你的问题,请参考以下文章