Why do cudaMemcpyAsync and kernel launches block even when using asynchronous streams?

Consider the following program, intended to enqueue some work on a non-blocking GPU stream:

#include <iostream>
#include <memory>  // std::unique_ptr

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    // Enqueue two async copies, a kernel launch and a host callback on the stream
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

The buffer size, and the kernel's sleep length in cycles, are high enough that, as they execute in parallel with the CPU thread, the enqueueing should finish well before they have concluded (8 ms + 8 ms for the two copies and 20 ms for the kernel).

And yet, looking at the trace below, the two cudaMemcpyAsync() calls appear to actually be synchronous, i.e. they block until the (non-blocking) stream has actually finished copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?


Trace (numbered rows, times in µs):

      1 "Start"        "Duration"    "Grid X"                             "Grid Y"  "Grid Z"    "Block X"   "Block Y"                       "Block Z"  
    104 14102.830000   59264.347000  "cudaMalloc"
    105 73368.351000   19.886000     "cudaStreamCreateWithFlags"
    106 73388.850000   8330.257000   "cudaMemcpyAsync"
    107 73565.702000   8334.265000   47.683716                            5.587311  "Pageable"  "Device"    "GeForce GTX 650 Ti BOOST (0)"  "1"        
    108 81721.124000   2.394000      "cudaConfigureCall"
    109 81723.865000   3.585000      "cudaSetupArgument"
    110 81729.332000   30.742000     "cudaLaunch (dummy(__int64) [107])"
    111 81760.604000   39589.422000  "cudaMemcpyAsync"
    112 81906.303000   20157.648000  128                                  1         1           128         1                               1          
    113 102073.103000  18736.208000  47.683716                            2.485355  "Device"    "Pageable"  "GeForce GTX 650 Ti BOOST (0)"  "1"        
    114 121351.936000  5.560000      "cudaStreamSynchronize"
Answer

This looked strange to me, so I contacted someone on the CUDA driver team, who confirmed that the documentation is correct. I was also able to confirm it myself:

#include <iostream>
#include <memory>
#include <cstdlib>  // malloc
#include <time.h>   // clock_gettime, timespec

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main(int argc, char* argv[]) {
    cudaFree(0);  // force CUDA's lazy context initialization up front
    struct timespec start, stop;
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;  // 2 GiB
    bool callback_executed = false;
    cudaStream_t stream;
    void* host_ptr;
    if (argc == 1) {
        host_ptr = malloc(buffer_size);             // pageable host memory
    } else {
        cudaMallocHost(&host_ptr, buffer_size, 0);  // pinned host memory
    }
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    // CPU time spent in the enqueue call, converted to seconds for printing
    double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

This is basically your code, with a few modifications:

  • Time measurement (see the sketch right after this list)
  • A switch to allocate from either pageable or pinned memory
  • A buffer size of 2 GiB, to ensure a measurable copy time
  • cudaFree(0) to force CUDA's lazy initialization
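
As a side note, the time measurement could also be done portably with std::chrono instead of clock_gettime. A minimal sketch of such a helper (my own addition, not part of this answer's code; note that steady_clock measures wall-clock time, whereas CLOCK_PROCESS_CPUTIME_ID measures CPU time consumed by the process):

#include <chrono>

// Times a single call, returning elapsed wall-clock milliseconds.
template <typename F>
double time_call_ms(F&& f) {
    const auto start = std::chrono::steady_clock::now();
    f();  // e.g. a cudaMemcpyAsync enqueue
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}

// Usage:
// double ms = time_call_ms([&] {
//     cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
// });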

The results:

$ nvcc -std=c++11 main.cu -lrt

$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpyHtoD pageable -> device, fully async)
Elapsed: 5.20288 # (memcpyDtoH device -> pageable, sync)

$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpyHtoD pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpyDtoH device -> pinned, fully async)

Copying from pageable memory to the device is slower, but it is actually asynchronous.
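
If an existing pageable allocation cannot easily be replaced with cudaMallocHost, it can also be pinned in place with cudaHostRegister. A minimal sketch (my illustration; error checking omitted and the buffer size is arbitrary):

#include <cstdlib>
#include <cuda_runtime.h>

int main() {
    const size_t buffer_size = 1 << 20;  // illustrative size
    char* host_ptr = static_cast<char*>(malloc(buffer_size));  // pageable
    // Pin the existing allocation in place; cudaMemcpyAsync calls on it
    // can now qualify as truly asynchronous.
    cudaHostRegister(host_ptr, buffer_size, cudaHostRegisterDefault);

    // ... enqueue cudaMemcpyAsync(...) transfers on host_ptr here ...

    cudaHostUnregister(host_ptr);  // unpin before releasing the memory
    free(host_ptr);
}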

My bad, sorry. I removed my previous comments to avoid confusing people.

Another answer

Indeed, as @RobinThoni has pointed out, CUDA memory copies are only asynchronous under strict conditions. For the code in question, the problem is mainly the use of unpinned (i.e. pageable) host memory.

To quote another part of the Runtime API documentation (emphasis mine):

2. API synchronization behavior

The API provides memcpy/memset functions in both synchronous and asynchronous forms, the latter having an "Async" suffix. This is a misnomer as each function may exhibit synchronous or asynchronous behavior depending on the arguments passed to the function.

...

Asynchronous

  • For transfers from device memory to pageable host memory, the function will return only once the copy has completed.

And that's just half of it! It is actually also true that

  • For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred.
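
Applied to the code in the question, this means the fix is to allocate the host buffer as pinned memory rather than with a plain new char[]. A minimal sketch of the essential change (my illustration; error checking, the kernel and the callback are omitted):

#include <cuda_runtime.h>

int main() {
    const size_t buffer_size = 1e7;
    char* host_ptr;
    cudaMallocHost(&host_ptr, buffer_size);  // pinned instead of pageable

    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Both copies now satisfy the documentation's conditions and return
    // to the host immediately after being enqueued.
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(device_ptr);
    cudaFreeHost(host_ptr);  // pinned memory is freed with cudaFreeHost
}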
