cuda cpu function - gpu kernel overlap

Posted: 2014-08-23 21:22:40

Question:

I have a concurrency problem in a CUDA application that I'm developing to practice CUDA. I want to share the work between the GPU and the CPU by using cudaMemcpyAsync and the asynchronous behavior of CUDA kernels, but I cannot get CPU execution and GPU execution to overlap.

The CPU work overlaps with the host-to-device data transfer, but not with the kernel execution. The kernel essentially waits for the CPU to finish and call the synchronization function, and only then starts executing on the device. I can't understand this behavior; aren't kernels always asynchronous with respect to the CPU thread?
My GPU is an Nvidia GeForce GT 550M (Fermi architecture with 1 copy engine and 1 compute engine).

I am using CUDA 6.0 and Nsight 4.0.
Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <thread>
#include <chrono>
using namespace std;
struct point4D
{
    float x;
    float y;
    float z;
    float w;
};
void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC);
bool correct_output(point4D * data, unsigned int size);
void flush_buffer(point4D * data, unsigned int size);
void initialize_input(point4D *& data, unsigned int size);
void cudaCheckError(cudaError_t cudaStatus, const char* err);
// Implements cross product for 4D point on the GPU-side.
__global__ void gpu_kernel(point4D * d_ptrData, point4D * d_out, point4D pB, point4D pC)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    point4D pA = d_ptrData[index];
    point4D out; out.x = 0; out.y = 0; out.z = 0; out.w = 0;

    out.x += pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
    out.y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
    out.z += pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
    out.w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);

    d_out[index] = out;
}
// Implements cross product for 4D point on the CPU-side.
void cpu_function(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    for(unsigned int index = 0; index < h_dataSize; index++)
    {
        h_out[index].x = 0; h_out[index].y = 0; h_out[index].z = 0; h_out[index].w = 0;

        point4D pA = h_ptrData[index];

        h_out[index].x += pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
        h_out[index].y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
        h_out[index].z += pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
        h_out[index].w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);
    }
}
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    int device_count;
    cudaCheckError(cudaGetDeviceCount(&device_count), "Couldn't get device count!");
    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    devID = 0;
    cudaCheckError(cudaSetDevice(devID), "Couldn't set device!");
    cudaCheckError(cudaGetDeviceProperties(&deviceProps, devID), "Couldn't get Device Properties");
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProps.name, deviceProps.major, deviceProps.minor);

    cudaDeviceReset();

    const unsigned int DATA_SIZE = 30000000;
    bool bFinalResults = true;

    // Input Data Initialization
    point4D pointB;
    pointB.x = 1; pointB.y = 1; pointB.z = 0; pointB.w = 0;

    point4D pointC;
    pointC.x = 1; pointC.y = 1; pointC.z = 1; pointC.w = 0;

    point4D * data = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    point4D * out_points = (point4D*) malloc(DATA_SIZE * sizeof(point4D));

    initialize_input(data, DATA_SIZE);
    flush_buffer(out_points, DATA_SIZE);
    cout << endl << endl;

    // 1+way
    heterogenous_1way_plus(data, DATA_SIZE, out_points, pointB, pointC);
    bFinalResults &= correct_output(out_points, DATA_SIZE); // checking correctness

    free(out_points);
    free(data);

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
    return 0;
}
void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    cout << "1-way_plus: STARTS!!!" << endl;

    // Run 25% of the data on the CPU; the rest will be executed on the GPU.
    unsigned int ratioPercentCPUtoGPU = 25;
    unsigned int d_dataSize = (h_dataSize * (100 - ratioPercentCPUtoGPU))/100;
    h_dataSize = (h_dataSize * ratioPercentCPUtoGPU)/100;
    size_t memorySize = d_dataSize * sizeof(point4D);

    cout << "Data Ratio Between CPU and GPU:" << (float)ratioPercentCPUtoGPU/100 << endl;
    cout << "CPU will process " << h_dataSize << " data." << endl;
    cout << "GPU will process " << d_dataSize << " data." << endl;

    // register host memory as page-locked (required for cudaMemcpyAsync to be asynchronous)
    cudaCheckError(cudaHostRegister(h_ptrData, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");
    cudaCheckError(cudaHostRegister(h_out, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");

    // allocate device memory
    point4D * d_in = 0; point4D * d_out = 0;
    cudaCheckError(cudaMalloc( (void **)&d_in, memorySize), "cudaMalloc failed!");
    cudaCheckError(cudaMalloc( (void **)&d_out, memorySize), "cudaMalloc failed!");

    // set kernel launch configuration
    dim3 nThreads = dim3(1000,1);
    dim3 nBlocks = dim3(d_dataSize / nThreads.x,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t stream;
    cudaCheckError(cudaStreamCreate(&stream), "cudaStreamCreate failed!");

    // create cuda event handles
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start), "cudaEventCreate failed!");
    cudaCheckError(cudaEventCreate(&stop), "cudaEventCreate failed!");

    // main thread waits for device
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    float gpu_time = 0.0f;
    cudaEventRecord(start, stream);

    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);

    cudaEventRecord(stop, stream);

    // The CPU processes the tail of the buffers, starting right after the GPU's portion.
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");
    cudaCheckError(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime failed!");
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    // release resources
    cudaCheckError(cudaEventDestroy(start), "cudaEventDestroy failed!");
    cudaCheckError(cudaEventDestroy(stop), "cudaEventDestroy failed!");
    cudaCheckError(cudaHostUnregister(h_ptrData), "cudaHostUnregister failed!");
    cudaCheckError(cudaHostUnregister(h_out), "cudaHostUnregister failed!");
    cudaCheckError(cudaFree(d_in), "cudaFree failed!");
    cudaCheckError(cudaFree(d_out), "cudaFree failed!");
    cudaCheckError(cudaStreamDestroy(stream), "cudaStreamDestroy failed!");

    cudaDeviceReset();

    cout << "Execution of GPU: " << gpu_time << "ms" << endl;
    cout << "1-way_plus: ENDS!!!" << endl;
}
// Checks correctness of outputs
bool correct_output(point4D * data, unsigned int size)
{
    const static float x = 0, y = 0, z = 0, w = -1;

    for (unsigned int i = 0; i < size; i++)
    {
        if (data[i].x != x || data[i].y != y ||
            data[i].z != z || data[i].w != w)
        {
            printf("Error! data[%d] = [%f, %f, %f, %f], ref = [%f, %f, %f, %f]\n",
                   i, data[i].x, data[i].y, data[i].z, data[i].w, x, y, z, w);
            return 0;
        }
    }
    return 1;
}
// Refresh the output buffer
void flush_buffer(point4D * data, unsigned int size)
{
    for(unsigned int i = 0; i < size; i++)
    {
        data[i].x = 0; data[i].y = 0; data[i].z = 0; data[i].w = 0;
    }
}
// Initialize the input data to feed the system for simulation
void initialize_input(point4D *& data, unsigned int size)
{
    for(unsigned int idx = 0; idx < size; idx++)
    {
        point4D* d = &data[idx];
        d->x = 1;
        d->y = 0;
        d->z = 0;
        d->w = 0;
    }
}
void cudaCheckError(cudaError_t cudaStatus, const char* err)
{
    if(cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "%s\n", err);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
Here is the Nsight screenshot:

[Nsight timeline screenshot]
Comments on the question:
I couldn't attach the Nsight screenshot because I don't have enough reputation :/

I suggest you provide a complete example that others can copy, paste, compile, and run. What happens if you remove the cudaEventRecord() operations?

@RobertCrovella, thanks for the suggestion; I've added the complete code. Unfortunately, removing cudaEventRecord() didn't fix anything.
Answer:
Based on what I can see in your profiler image, you are getting proper overlap. I ran your code and saw something similar.

Generally speaking, the key sequence in your code is this:
1. cudaMemcpyAsync H2D
2. kernel call
3. cudaMemcpyAsync D2H
4. cpu function
5. cudaStreamSynchronize
The CPU thread processes these steps in that order. Steps 1-3 are asynchronous, meaning control is returned to the CPU thread immediately, without waiting for the underlying CUDA operation to complete. And you would like step 4 to overlap as much as possible with steps 1, 2, and 3.
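(To make that ordering concrete, here is a minimal, self-contained toy program following the same five steps. It is my own illustration, not the code from the question or this answer; the kernel, buffer sizes, and names in it are invented for the example.)

// Toy example: queue async H2D copy, kernel, and async D2H copy on a stream (steps 1-3),
// do unrelated CPU work (step 4), then block on the stream (step 5).
// Build with: nvcc -o overlap_demo overlap_demo.cu
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(float *d, int n, float f)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d[i] *= f;
}

int main()
{
    const int N = 1 << 22;
    const size_t bytes = N * sizeof(float);

    float *h = 0, *d = 0;
    cudaHostAlloc((void**)&h, bytes, cudaHostAllocDefault); // pinned memory, so the async copies really are asynchronous
    cudaMalloc((void**)&d, bytes);
    for (int i = 0; i < N; ++i) h[i] = 1.0f;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Steps 1-3: control returns to the host thread immediately after each call.
    cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice, stream);   // 1. H2D copy
    scale<<<(N + 255) / 256, 256, 0, stream>>>(d, N, 2.0f);         // 2. kernel
    cudaMemcpyAsync(h, d, bytes, cudaMemcpyDeviceToHost, stream);   // 3. D2H copy

    // Step 4: CPU work that can overlap with the in-flight GPU work above.
    double acc = 0.0;
    for (int i = 0; i < 10000000; ++i) acc += i * 0.5;

    // Step 5: only here does the host thread block until the stream has drained.
    cudaStreamSynchronize(stream);

    printf("h[0] = %f (expected 2.0), cpu result = %f\n", h[0], acc);

    cudaStreamDestroy(stream);
    cudaFree(d);
    cudaFreeHost(h);
    return 0;
}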
We see that the cudaStreamSynchronize() call shows up on the timeline roughly coincident with the start of kernel execution. This means that all CPU thread activity preceding the cudaStreamSynchronize() call had completed by that point (i.e., by roughly the start of actual kernel execution). Therefore the cpu function (step 4) that we want to overlap with steps 1-3 had in fact completed by the start of step 2 (in terms of actual CUDA execution). So your cpu function fully overlapped with the first host->device memcpy operation.

So it is working as expected. Because the cudaStreamSynchronize() call blocks the CPU thread until all stream activity is complete, it occupies the timeline from the point at which it is encountered until the point at which the stream activity finishes.
The fact that the cudaStreamSynchronize() call appears to coincide with the start of kernel execution, and that there is a gap between the end of the H2D memcpy and the start of the kernel, is probably due to WDDM command batching. When I profiled your code under Linux I did not see the gap or the exact coincidence, but otherwise the general flow was the same. Here is what I saw with the visual profiler under Linux:

[profiler screenshot]

Note that in the image above, cudaStreamSynchronize() is actually encountered during the H2D memcpy operation, before the kernel starts.
In response to a question in the comments, I modified the app so that the split percentage is 50 instead of 25:

unsigned int ratioPercentCPUtoGPU = 50;

Here is what the new profiler output looks like:

[profiler screenshot]
We see that the CPU takes more time relative to the GPU kernel call, and so the CPU thread does not encounter the cudaStreamSynchronize() call until during the D2H memcpy operation. We continue to see, under Linux, that there is no fixed relationship between this point and the start of kernel execution. Now the CPU execution fully overlaps with the H2D memcpy, the kernel execution, and a small portion of the D2H memcpy.
Comments:
OK, if I understand correctly, you're talking about WDDM. The question in my mind is this: when my cpu function finishes before the H2D data transfer, I sometimes get a similar overlap pattern; but when it takes longer than the H2D transfer, it starts to push back the kernel execution. As far as I can see in your profile, your cpu execution ends before the kernel does. Could you check again by increasing ratioPercentCPUtoGPU in the heterogenous_1way_plus() function? By the way, thanks for the quick reply.
You were exactly right. I checked link and link today, and I tried adding cudaEventQuery(stop) right after cudaEventRecord(stop, stream), and it solved my problem.
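A sketch of how that fix would slot into the heterogenous_1way_plus() function from the question (the surrounding lines are the question's own code; only the query lines and comments are new, and cudaStreamQuery() is offered as an alternative I have not tested here):

    cudaEventRecord(stop, stream);

    // On WDDM, the driver may batch the asynchronous commands queued above instead of
    // submitting them to the GPU right away. Querying the event (or the stream) does not
    // block the host thread, but it tends to flush the queued work to the device so the
    // copies and kernel can start before the CPU loop below begins.
    cudaEventQuery(stop);        // return value intentionally ignored (may be cudaErrorNotReady)
    // cudaStreamQuery(stream);  // an alternative query with a similar flushing effect

    // The CPU portion, as in the question:
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");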