为什么即使使用异步流，cudaMemcpyAsync和内核启动也会阻塞？

Question

考虑以下程序，以便在非阻塞GPU流上排队一些工作：

#include <iostream>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

缓冲区的大小和内核在周期中的睡眠长度足够高，当它们与CPU线程并行执行时，它应该在它们结束之前完成入队（8ms + 8ms用于复制，20ms用于复制）内核）。

然而，看看下面的曲线，似乎两个cudaMemcpyAsync()实际上是同步的，即它们阻塞直到（非阻塞）流实际上已完成复制。这是预期的行为吗？它似乎收缩了relevant section的CUDA Runtime API documentation。它有什么意义？

跟踪:(编号行，使用时间）：

      1 "Start"        "Duration"    "Grid X"                             "Grid Y"  "Grid Z"    "Block X"   "Block Y"                       "Block Z"  
    104 14102.830000   59264.347000  "cudaMalloc"
    105 73368.351000   19.886000     "cudaStreamCreateWithFlags"
    106 73388.850000   8330.257000   "cudaMemcpyAsync"
    107 73565.702000   8334.265000   47.683716                            5.587311  "Pageable"  "Device"    "GeForce GTX 650 Ti BOOST (0)"  "1"        
    108 81721.124000   2.394000      "cudaConfigureCall"
    109 81723.865000   3.585000      "cudaSetupArgument"
    110 81729.332000   30.742000     "cudaLaunch (dummy(__int64) [107])"
    111 81760.604000   39589.422000  "cudaMemcpyAsync"
    112 81906.303000   20157.648000  128                                  1         1           128         1                               1          
    113 102073.103000  18736.208000  47.683716                            2.485355  "Device"    "Pageable"  "GeForce GTX 650 Ti BOOST (0)"  "1"        
    114 121351.936000  5.560000      "cudaStreamSynchronize"

Answer 1

另一答案

为什么即使使用异步流，cudaMemcpyAsync和内核启动也会阻塞？

2. API synchronization behavior