cuda cpu function - gpu kernel overlap

Posted: 2014-08-23 21:22:40

Question:

I have a concurrency problem in a CUDA application that I'm developing to practice CUDA. I want to share the work between the GPU and the CPU by using cudaMemcpyAsync and the asynchronous behavior of CUDA kernels, but I cannot get CPU execution and GPU execution to overlap.

The CPU work overlaps with the host-to-device data transfer, but not with the kernel execution. The kernel essentially waits for the CPU to finish and call the synchronization function, and only then starts executing on the device. I can't understand this behavior; aren't kernels always asynchronous with respect to the CPU thread?
My GPU is an Nvidia GeForce GT 550M (Fermi architecture with 1 copy engine and 1 compute engine).

I am using CUDA 6.0 and Nsight 4.0.
Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <thread>
#include <chrono>
using namespace std;
struct point4D
{
    float x;
    float y;
    float z;
    float w;
};
void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC);
bool correct_output(point4D * data, unsigned int size);
void flush_buffer(point4D * data, unsigned int size);
void initialize_input(point4D *& data, unsigned int size);
void cudaCheckError(cudaError_t cudaStatus, const char* err);
// Implements cross product for 4D point on the GPU-side.
__global__ void gpu_kernel(point4D * d_ptrData, point4D * d_out, point4D pB, point4D pC)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    point4D pA = d_ptrData[index];
    point4D out; out.x = 0; out.y = 0; out.z = 0; out.w = 0;

    out.x += pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
    out.y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
    out.z += pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
    out.w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);

    d_out[index] = out;
}
// Implements cross product for 4D point on the CPU-side.
void cpu_function(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    for(unsigned int index = 0; index < h_dataSize; index++)
    {
        h_out[index].x = 0; h_out[index].y = 0; h_out[index].z = 0; h_out[index].w = 0;

        point4D pA = h_ptrData[index];

        h_out[index].x += pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
        h_out[index].y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
        h_out[index].z += pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
        h_out[index].w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);
    }
}
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    int device_count;
    cudaCheckError(cudaGetDeviceCount(&device_count), "Couldn't get device count!");
    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    devID = 0;
    cudaCheckError(cudaSetDevice(devID), "Couldn't set device!");
    cudaCheckError(cudaGetDeviceProperties(&deviceProps, devID), "Couldn't get Device Properties");
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProps.name, deviceProps.major, deviceProps.minor);

    cudaDeviceReset();

    const unsigned int DATA_SIZE = 30000000;
    bool bFinalResults = true;

    // Input Data Initialization
    point4D pointB;
    pointB.x = 1; pointB.y = 1; pointB.z = 0; pointB.w = 0;

    point4D pointC;
    pointC.x = 1; pointC.y = 1; pointC.z = 1; pointC.w = 0;

    point4D * data = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    point4D * out_points = (point4D*) malloc(DATA_SIZE * sizeof(point4D));

    initialize_input(data, DATA_SIZE);
    flush_buffer(out_points, DATA_SIZE);
    cout << endl << endl;

    // 1+way
    heterogenous_1way_plus(data, DATA_SIZE, out_points, pointB, pointC);
    bFinalResults &= correct_output(out_points, DATA_SIZE); // checking correctness

    free(out_points);
    free(data);

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
    return 0;
}
void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    cout << "1-way_plus: STARTS!!!" << endl;

    // Run 25% of the data on the CPU; the rest will be executed on the GPU.
    unsigned int ratioPercentCPUtoGPU = 25;
    unsigned int d_dataSize = (h_dataSize * (100 - ratioPercentCPUtoGPU))/100;
    h_dataSize = (h_dataSize * ratioPercentCPUtoGPU)/100;
    size_t memorySize = d_dataSize * sizeof(point4D);

    cout << "Data Ratio Between CPU and GPU:" << (float)ratioPercentCPUtoGPU/100 << endl;
    cout << "CPU will process " << h_dataSize << " data." << endl;
    cout << "GPU will process " << d_dataSize << " data." << endl;

    // register host memory as page-locked (required for cudaMemcpyAsync to be asynchronous)
    cudaCheckError(cudaHostRegister(h_ptrData, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");
    cudaCheckError(cudaHostRegister(h_out, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");

    // allocate device memory
    point4D * d_in = 0; point4D * d_out = 0;
    cudaCheckError(cudaMalloc( (void **)&d_in, memorySize), "cudaMalloc failed!");
    cudaCheckError(cudaMalloc( (void **)&d_out, memorySize), "cudaMalloc failed!");

    // set kernel launch configuration
    dim3 nThreads = dim3(1000,1);
    dim3 nBlocks = dim3(d_dataSize / nThreads.x,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t stream;
    cudaCheckError(cudaStreamCreate(&stream), "cudaStreamCreate failed!");

    // create cuda event handles
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start), "cudaEventCreate failed!");
    cudaCheckError(cudaEventCreate(&stop), "cudaEventCreate failed!");

    // main thread waits for device
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    float gpu_time = 0.0f;
    cudaEventRecord(start, stream);

    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);

    cudaEventRecord(stop, stream);

    // The CPU processes the tail of the buffers, starting right after the GPU's portion.
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");
    cudaCheckError(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime failed!");
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    // release resources
    cudaCheckError(cudaEventDestroy(start), "cudaEventDestroy failed!");
    cudaCheckError(cudaEventDestroy(stop), "cudaEventDestroy failed!");
    cudaCheckError(cudaHostUnregister(h_ptrData), "cudaHostUnregister failed!");
    cudaCheckError(cudaHostUnregister(h_out), "cudaHostUnregister failed!");
    cudaCheckError(cudaFree(d_in), "cudaFree failed!");
    cudaCheckError(cudaFree(d_out), "cudaFree failed!");
    cudaCheckError(cudaStreamDestroy(stream), "cudaStreamDestroy failed!");

    cudaDeviceReset();

    cout << "Execution of GPU: " << gpu_time << "ms" << endl;
    cout << "1-way_plus: ENDS!!!" << endl;
}
// Checks correctness of outputs
bool correct_output(point4D * data, unsigned int size)
{
    const static float x = 0, y = 0, z = 0, w = -1;

    for (unsigned int i = 0; i < size; i++)
    {
        if (data[i].x != x || data[i].y != y ||
            data[i].z != z || data[i].w != w)
        {
            printf("Error! data[%d] = [%f, %f, %f, %f], ref = [%f, %f, %f, %f]\n",
                   i, data[i].x, data[i].y, data[i].z, data[i].w, x, y, z, w);
            return 0;
        }
    }
    return 1;
}
// Refresh the output buffer
void flush_buffer(point4D * data, unsigned int size)
{
    for(unsigned int i = 0; i < size; i++)
    {
        data[i].x = 0; data[i].y = 0; data[i].z = 0; data[i].w = 0;
    }
}
// Initialize the input data to feed the system for simulation
void initialize_input(point4D *& data, unsigned int size)
{
    for(unsigned int idx = 0; idx < size; idx++)
    {
        point4D* d = &data[idx];
        d->x = 1;
        d->y = 0;
        d->z = 0;
        d->w = 0;
    }
}
void cudaCheckError(cudaError_t cudaStatus, const char* err)
{
    if(cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "%s\n", err);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
Here is the Nsight screenshot:

[Nsight timeline screenshot]
Comments on the question:
I couldn't attach the Nsight screenshot because I don't have enough reputation :/

I suggest you provide a complete example that others can copy, paste, compile, and run. What happens if you remove the cudaEventRecord() operations?

@RobertCrovella, thanks for the suggestion; I've added the complete code. Unfortunately, removing cudaEventRecord() didn't fix anything.
Answer:
Based on what I can see in your profiler image, you are getting proper overlap. I ran your code and saw something similar.

Generally speaking, the key sequence in your code is this:
1. cudaMemcpyAsync H2D
2. kernel call
3. cudaMemcpyAsync D2H
4. cpu function
5. cudaStreamSynchronize
The CPU thread processes these steps in that order. Steps 1-3 are asynchronous, meaning control is returned to the CPU thread immediately, without waiting for the underlying CUDA operation to complete. And you would like step 4 to overlap as much as possible with steps 1, 2, and 3.
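(To make that ordering concrete, here is a minimal, self-contained toy program following the same five steps. It is my own illustration, not the code from the question or this answer; the kernel, buffer sizes, and names in it are invented for the example.)

// Toy example: queue async H2D copy, kernel, and async D2H copy on a stream (steps 1-3),
// do unrelated CPU work (step 4), then block on the stream (step 5).
// Build with: nvcc -o overlap_demo overlap_demo.cu
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(float *d, int n, float f)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d[i] *= f;
}

int main()
{
    const int N = 1 << 22;
    const size_t bytes = N * sizeof(float);

    float *h = 0, *d = 0;
    cudaHostAlloc((void**)&h, bytes, cudaHostAllocDefault); // pinned memory, so the async copies really are asynchronous
    cudaMalloc((void**)&d, bytes);
    for (int i = 0; i < N; ++i) h[i] = 1.0f;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Steps 1-3: control returns to the host thread immediately after each call.
    cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice, stream);   // 1. H2D copy
    scale<<<(N + 255) / 256, 256, 0, stream>>>(d, N, 2.0f);         // 2. kernel
    cudaMemcpyAsync(h, d, bytes, cudaMemcpyDeviceToHost, stream);   // 3. D2H copy

    // Step 4: CPU work that can overlap with the in-flight GPU work above.
    double acc = 0.0;
    for (int i = 0; i < 10000000; ++i) acc += i * 0.5;

    // Step 5: only here does the host thread block until the stream has drained.
    cudaStreamSynchronize(stream);

    printf("h[0] = %f (expected 2.0), cpu result = %f\n", h[0], acc);

    cudaStreamDestroy(stream);
    cudaFree(d);
    cudaFreeHost(h);
    return 0;
}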
We see that the cudaStreamSynchronize() call shows up on the timeline roughly coincident with the start of kernel execution. This means that all CPU thread activity preceding the cudaStreamSynchronize() call had completed by that point (i.e., by roughly the start of actual kernel execution). Therefore the cpu function (step 4) that we want to overlap with steps 1-3 had in fact completed by the start of step 2 (in terms of actual CUDA execution). So your cpu function fully overlapped with the first host->device memcpy operation.

So it is working as expected. Because the cudaStreamSynchronize() call blocks the CPU thread until all stream activity is complete, it occupies the timeline from the point at which it is encountered until the point at which the stream activity finishes.
The fact that the cudaStreamSynchronize() call appears to coincide with the start of kernel execution, and that there is a gap between the end of the H2D memcpy and the start of the kernel, is probably due to WDDM command batching. When I profiled your code under Linux I did not see the gap or the exact coincidence, but otherwise the general flow was the same. Here is what I saw with the visual profiler under Linux:

[profiler screenshot]

Note that in the image above, cudaStreamSynchronize() is actually encountered during the H2D memcpy operation, before the kernel starts.
In response to a question in the comments, I modified the app so that the split percentage is 50 instead of 25:

unsigned int ratioPercentCPUtoGPU = 50;

Here is what the new profiler output looks like:

[profiler screenshot]
We see that the CPU takes more time relative to the GPU kernel call, and so the CPU thread does not encounter the cudaStreamSynchronize() call until during the D2H memcpy operation. We continue to see, under Linux, that there is no fixed relationship between this point and the start of kernel execution. Now the CPU execution fully overlaps with the H2D memcpy, the kernel execution, and a small portion of the D2H memcpy.
Comments:
OK, if I understand correctly, you're talking about WDDM. The question in my mind is this: when my cpu function finishes before the H2D data transfer, I sometimes get a similar overlap pattern; but when it takes longer than the H2D transfer, it starts to push back the kernel execution. As far as I can see in your profile, your cpu execution ends before the kernel does. Could you check again by increasing ratioPercentCPUtoGPU in the heterogenous_1way_plus() function? By the way, thanks for the quick reply.
You were exactly right. I checked link and link today, and I tried adding cudaEventQuery(stop) right after cudaEventRecord(stop, stream), and it solved my problem.
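A sketch of how that fix would slot into the heterogenous_1way_plus() function from the question (the surrounding lines are the question's own code; only the query lines and comments are new, and cudaStreamQuery() is offered as an alternative I have not tested here):

    cudaEventRecord(stop, stream);

    // On WDDM, the driver may batch the asynchronous commands queued above instead of
    // submitting them to the GPU right away. Querying the event (or the stream) does not
    // block the host thread, but it tends to flush the queued work to the device so the
    // copies and kernel can start before the CPU loop below begins.
    cudaEventQuery(stop);        // return value intentionally ignored (may be cudaErrorNotReady)
    // cudaStreamQuery(stream);  // an alternative query with a similar flushing effect

    // The CPU portion, as in the question:
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");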